Skip to content

Commit

Permalink
add Python implementation of Editops/Opcodes
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Oct 6, 2022
1 parent aa6a88f commit 773a45e
Show file tree
Hide file tree
Showing 73 changed files with 2,188 additions and 1,638 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/releasebuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ jobs:
fail-fast: false
matrix:
arch: [auto, aarch64, ppc64le, s390x]
python_tag: [ "cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*"]
python_tag: ["cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*"]
exclude:
# PyPy builds not available for these platforms
- arch: ppc64le
Expand Down
24 changes: 24 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ repos:
- flake8-bugbear
- pep8-naming

# Flake8 also supports pre-commit natively (same author)
- repo: https://github.com/PyCQA/flake8
rev: "5.0.4"
hooks:
- id: flake8
exclude: ^(docs/.*|tools/.*)$
additional_dependencies: *flake8_dependencies

# PyLint has native support - not always usable, but works for us
- repo: https://github.com/PyCQA/pylint
rev: "v2.15.3"
hooks:
- id: pylint
# NOTE(review): was `^pybind11`, which matches no path in this repository, so the hook never ran — confirm intended scope.
files: ^src/rapidfuzz

# CMake formatting
- repo: https://github.com/cheshirekow/cmake-format-precommit
rev: "v0.6.13"
Expand All @@ -106,6 +121,15 @@ repos:
types: [file]
files: (\.cmake|CMakeLists.txt)(.in)?$

# Check static types with mypy
#- repo: https://github.com/pre-commit/mirrors-mypy
# rev: "v0.971"
# hooks:
# - id: mypy
# args: []
# exclude: ^(tests|docs)/
# additional_dependencies: [nox, rich]

# Checks the manifest for missing files (native support)
- repo: https://github.com/mgedmin/check-manifest
rev: "0.48"
Expand Down
16 changes: 9 additions & 7 deletions bench/benchmark_fuzz.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# TODO: combine the scorer benchmarks into a common code base
import timeit

import numpy as np
import pandas


Expand Down Expand Up @@ -63,7 +62,7 @@ def scorer_benchmark(funcname):
# token_ratio is unique to RapidFuzz
time_token_ratio = benchmark(
"token_ratio",
f"[rfuzz.token_ratio(a, b, processor=None) for b in b_list]",
"[rfuzz.token_ratio(a, b, processor=None) for b in b_list]",
setup,
lengths,
count,
Expand All @@ -72,7 +71,7 @@ def scorer_benchmark(funcname):
# this gets very slow, so only benchmark it for smaller values
time_token_ratio_simple = benchmark(
"fuzzywuzzy",
f"[max(rfuzz.token_sort_ratio(a, b, processor=None), rfuzz.token_set_ratio(a, b, processor=None)) for b in b_list]",
"[max(rfuzz.token_sort_ratio(a, b, processor=None), rfuzz.token_set_ratio(a, b, processor=None)) for b in b_list]",
setup,
lengths,
count,
Expand All @@ -86,12 +85,12 @@ def scorer_benchmark(funcname):
}
)

df.to_csv(f"results/token_ratio.csv", sep=",", index=False)
df.to_csv("results/token_ratio.csv", sep=",", index=False)

# partial_token_ratio is unique to RapidFuzz
time_partial_token_ratio = benchmark(
"token_ratio",
f"[rfuzz.partial_token_ratio(a, b, processor=None) for b in b_list]",
"[rfuzz.partial_token_ratio(a, b, processor=None) for b in b_list]",
setup,
lengths,
count,
Expand All @@ -100,7 +99,10 @@ def scorer_benchmark(funcname):
# this gets very slow, so only benchmark it for smaller values
time_partial_token_ratio_simple = benchmark(
"fuzzywuzzy",
f"[max(rfuzz.partial_token_sort_ratio(a, b, processor=None), rfuzz.partial_token_set_ratio(a, b, processor=None)) for b in b_list]",
(
"[max(rfuzz.partial_token_sort_ratio(a, b, processor=None), "
"rfuzz.partial_token_set_ratio(a, b, processor=None)) for b in b_list]"
),
setup,
lengths,
count,
Expand All @@ -114,4 +116,4 @@ def scorer_benchmark(funcname):
}
)

df.to_csv(f"results/partial_token_ratio.csv", sep=",", index=False)
df.to_csv("results/partial_token_ratio.csv", sep=",", index=False)
33 changes: 33 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,36 @@ requires = [
]
build-backend = "backend"
backend-path = ["_custom_build"]

[tool.isort]
profile = "black"

[tool.mypy]
files = "src"
python_version = "3.7"
strict = true
show_error_codes = true
enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
warn_unreachable = true

[tool.pytest.ini_options]
minversion = "6.0"
addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
xfail_strict = true
filterwarnings = ["error"]
log_cli_level = "info"
testpaths = ["tests"]

[tool.pylint]
# Keep in sync with [tool.mypy] python_version: 3.7 is the oldest interpreter targeted.
master.py-version = "3.7"
reports.output-format = "colorized"
# Each message group is listed once; "imports" previously appeared twice.
messages_control.disable = [
  "design",
  "fixme",
  "imports",
  "line-too-long",
  "invalid-name",
  "protected-access",
  "missing-module-docstring",
]
6 changes: 6 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[flake8]
max-line-length = 120
show_source = True
exclude = .git, __pycache__, build, dist, docs, tools, venv
extend-ignore = E203, E722, B903, B950, N801, N802, N806
extend-select = B9
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def run_setup(with_binary):
else:
try:
run_setup(True)
except:
except BaseException:
show_message(
"WARNING: The C extension could not be compiled, speedups"
" are not enabled.",
Expand Down
42 changes: 31 additions & 11 deletions src/rapidfuzz/_utils.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,33 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

from __future__ import annotations

def _GetScorerFlagsDistance(**kwargs):
from typing import Any, Callable


def _get_scorer_flags_distance(**_kwargs: Any) -> dict[str, Any]:
return {"optimal_score": 0, "worst_score": 2**63 - 1, "flags": (1 << 6)}


def _GetScorerFlagsSimilarity(**kwargs):
def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]:
return {"optimal_score": 2**63 - 1, "worst_score": 0, "flags": (1 << 6)}


def _GetScorerFlagsNormalizedDistance(**kwargs):
def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]:
return {"optimal_score": 0, "worst_score": 1, "flags": (1 << 5)}


def _GetScorerFlagsNormalizedSimilarity(**kwargs):
def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]:
return {"optimal_score": 1, "worst_score": 0, "flags": (1 << 5)}


def fallback_import(module: str, name: str, set_attrs: bool = True):
def fallback_import(
module: str,
name: str,
cached_scorer_call: dict[str, Callable[..., dict[str, Any]]] | None = None,
set_attrs: bool = True,
) -> Any:
"""
import library function and possibly fall back to a pure Python version
when no C++ implementation is available
Expand All @@ -35,6 +44,9 @@ def fallback_import(module: str, name: str, set_attrs: bool = True):
f"cannot import name '{name}' from '{py_mod.__name__}' ({py_mod.__file__})"
)

if cached_scorer_call:
py_func._RF_ScorerPy = cached_scorer_call

if impl == "cpp":
cpp_mod = importlib.import_module(module + "_cpp")
elif impl == "python":
Expand All @@ -55,14 +67,22 @@ def fallback_import(module: str, name: str, set_attrs: bool = True):
if set_attrs:
cpp_func.__name__ = py_func.__name__
cpp_func.__doc__ = py_func.__doc__

if cached_scorer_call:
cpp_func._RF_ScorerPy = cached_scorer_call

return cpp_func


default_distance_attribute = {"get_scorer_flags": _GetScorerFlagsDistance}
default_similarity_attribute = {"get_scorer_flags": _GetScorerFlagsSimilarity}
default_normalized_distance_attribute = {
"get_scorer_flags": _GetScorerFlagsNormalizedDistance
default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
"get_scorer_flags": _get_scorer_flags_distance
}
default_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
"get_scorer_flags": _get_scorer_flags_similarity
}
default_normalized_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
"get_scorer_flags": _get_scorer_flags_normalized_distance
}
default_normalized_similarity_attribute = {
"get_scorer_flags": _GetScorerFlagsNormalizedSimilarity
default_normalized_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
"get_scorer_flags": _get_scorer_flags_normalized_similarity
}
7 changes: 0 additions & 7 deletions src/rapidfuzz/_utils.pyi

This file was deleted.

19 changes: 13 additions & 6 deletions src/rapidfuzz/cpp_common.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,25 @@
# cython: language_level=3, binding=True, linetrace=True

from cpython.object cimport PyObject
from cpython.pycapsule cimport (PyCapsule_GetPointer, PyCapsule_IsValid,
PyCapsule_New)
from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid, PyCapsule_New
from libc.stddef cimport wchar_t
from libc.stdint cimport int64_t, uint64_t
from libc.stdlib cimport free, malloc
from libcpp cimport bool
from libcpp.utility cimport move, pair
from libcpp.vector cimport vector
from rapidfuzz_capi cimport (SCORER_STRUCT_VERSION, RF_GetScorerFlags,
RF_Kwargs, RF_KwargsInit, RF_Preprocessor,
RF_Scorer, RF_ScorerFlags, RF_ScorerFuncInit,
RF_String, RF_StringType)
from rapidfuzz_capi cimport (
SCORER_STRUCT_VERSION,
RF_GetScorerFlags,
RF_Kwargs,
RF_KwargsInit,
RF_Preprocessor,
RF_Scorer,
RF_ScorerFlags,
RF_ScorerFuncInit,
RF_String,
RF_StringType,
)

from array import array

Expand Down
19 changes: 10 additions & 9 deletions src/rapidfuzz/distance/DamerauLevenshtein.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

from __future__ import annotations

from rapidfuzz._utils import default_distance_attribute as _dist_attr
from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr
from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr
from rapidfuzz._utils import default_similarity_attribute as _sim_attr
from rapidfuzz._utils import fallback_import as _fallback_import

_mod = "rapidfuzz.distance.DamerauLevenshtein"
distance = _fallback_import(_mod, "distance")
similarity = _fallback_import(_mod, "similarity")
normalized_distance = _fallback_import(_mod, "normalized_distance")
normalized_similarity = _fallback_import(_mod, "normalized_similarity")

distance._RF_ScorerPy = _dist_attr
similarity._RF_ScorerPy = _sim_attr
normalized_distance._RF_ScorerPy = _norm_dist_attr
normalized_similarity._RF_ScorerPy = _norm_sim_attr
distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr)
similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr)
normalized_distance = _fallback_import(
_mod, "normalized_distance", cached_scorer_call=_norm_dist_attr
)
normalized_similarity = _fallback_import(
_mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr
)
59 changes: 23 additions & 36 deletions src/rapidfuzz/distance/DamerauLevenshtein.pyi
Original file line number Diff line number Diff line change
@@ -1,48 +1,35 @@
from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

from typing_extensions import Protocol
from __future__ import annotations

from rapidfuzz.distance import Editops, Opcodes
from typing import Callable, Hashable, Sequence

class _ScorerAttributes(Protocol):
_RF_ScorerPy: Dict

def _attr_decorator(func: Any) -> _ScorerAttributes:
return func

_StringType = Sequence[Hashable]
_S1 = TypeVar("_S1")
_S2 = TypeVar("_S2")

@_attr_decorator
def distance(
s1: _S1,
s2: _S2,
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
processor: Optional[Callable[..., _StringType]] = None,
score_cutoff: Optional[int] = None
processor: Callable[..., Sequence[Hashable]] | None = None,
score_cutoff: int | None = None,
) -> int: ...
@_attr_decorator
def normalized_distance(
s1: _S1,
s2: _S2,
*,
processor: Optional[Callable[..., _StringType]] = None,
score_cutoff: Optional[float] = 0
) -> float: ...
@_attr_decorator
def similarity(
s1: _S1,
s2: _S2,
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
processor: Optional[Callable[..., _StringType]] = None,
score_cutoff: Optional[int] = None
processor: Callable[..., Sequence[Hashable]] | None = None,
score_cutoff: int | None = None,
) -> int: ...
@_attr_decorator
def normalized_distance(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
processor: Callable[..., Sequence[Hashable]] | None = None,
score_cutoff: float | None = None,
) -> float: ...
def normalized_similarity(
s1: _S1,
s2: _S2,
s1: Sequence[Hashable],
s2: Sequence[Hashable],
*,
processor: Optional[Callable[..., _StringType]] = None,
score_cutoff: Optional[float] = 0
processor: Callable[..., Sequence[Hashable]] | None = None,
score_cutoff: float | None = None,
) -> float: ...
Loading

0 comments on commit 773a45e

Please sign in to comment.