Skip to content

Commit 248b13c

Browse files
authored
fix(tests): Add python bindings to the test coverage report (#58)
* feat(test): Improved test coverage for edge cases and errors
* feat(test): More test coverage for measures
* feat(ci): added dependabot config
1 parent baeb25f commit 248b13c

File tree

9 files changed

+626
-42
lines changed

9 files changed

+626
-42
lines changed

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,38 +7,20 @@ assignees: ""
77

88
---
99

10-
**Description**
10+
## Description
1111

1212
<!-- Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. -->
1313

14-
**Fixes** (issue)
14+
## Fixes (issue)
1515

16-
**Type of change**
17-
18-
<!-- Please update the title of your PR to match the type of change. The title will be used for the commit message and the changelog. -->
19-
<!---->
20-
<!-- The `(scope)` is optional and refers to the part of the codebase you're changing (e.g., `feat(search)`, `fix(ci)`). -->
21-
22-
<!-- - [ ] `fix`: A bug fix -->
23-
<!-- - [ ] `feat`: A new feature -->
24-
<!-- - [ ] `feat!`: A breaking change -->
25-
<!-- - [ ] `docs`: Documentation only changes -->
26-
<!-- - [ ] `chore`: Changes to the build process or auxiliary tools -->
27-
<!-- - [ ] `refactor`: A code change that neither fixes a bug nor adds a feature -->
28-
<!-- - [ ] `perf`: A code change that improves performance -->
29-
<!-- - [ ] `test`: Adding missing tests or correcting existing tests -->
30-
<!-- - [ ] `style`: Changes that do not affect the meaning of the code -->
31-
<!-- - [ ] `ci`: Changes to our CI configuration files and scripts -->
32-
<!-- - [ ] `revert`: Reverts a previous commit -->
33-
34-
**How Has This Been Tested?**
16+
## How Has This Been Tested?
3517

3618
<!-- Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. -->
3719

3820
- [ ] Test A
3921
- [ ] Test B
4022

41-
**Checklist:**
23+
## Checklist
4224

4325
- [ ] My code follows the style guidelines of this project
4426
- [ ] I have performed a self-review of my own code

.github/dependabot.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
version: 2
2+
updates:
3+
# Rust dependencies (Cargo)
4+
- package-ecosystem: "cargo"
5+
directory: "/"
6+
schedule:
7+
interval: "weekly"
8+
commit-message:
9+
prefix: "chore(deps)"
10+
labels:
11+
- "dependencies"
12+
- "rust"
13+
14+
# Python dependencies (pip/pyproject.toml)
15+
- package-ecosystem: "pip"
16+
directory: "/"
17+
schedule:
18+
interval: "weekly"
19+
commit-message:
20+
prefix: "chore(deps)"
21+
labels:
22+
- "dependencies"
23+
- "python"
24+
25+
# GitHub Actions
26+
- package-ecosystem: "github-actions"
27+
directory: "/"
28+
schedule:
29+
interval: "weekly"
30+
commit-message:
31+
prefix: "chore(deps)"
32+
labels:
33+
- "dependencies"
34+
- "ci"

.github/workflows/coverage.yml

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,6 @@ jobs:
2121
target/
2222
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
2323

24-
- name: Install system dependencies
25-
run: |
26-
sudo apt-get update
27-
sudo apt-get install -y software-properties-common python3-dev
28-
2924
- name: Set up Python
3025
uses: actions/setup-python@v6
3126
with:
@@ -48,21 +43,22 @@ jobs:
4843
# Set the path for the raw coverage data
4944
export LLVM_PROFILE_FILE="target/coverage/simstring_rs-%p-%m.profraw"
5045
# Run the Rust test suite (the Python bindings are exercised separately via pytest below)
51-
cargo test --all-features -- --include-ignored
52-
# Generate the coverage report
53-
grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "build.rs" --ignore "src/lib.rs" -o ./coverage.lcov
54-
grcov . --binary-path ./target/debug/ -s . -t cobertura --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "build.rs" --ignore "src/lib.rs" -o ./coverage/cobertura.xml
46+
cargo test --all-features
5547
# Generate Python coverage report
5648
pip install uv
5749
uv venv
5850
source .venv/bin/activate
5951
uv pip install maturin pytest coverage
6052
rm -rf target/wheels
61-
maturin build --release
62-
uv pip install target/wheels/*.whl
53+
# Build and install in editable mode for coverage mapping
54+
maturin develop
6355
coverage run -m pytest tests/python/ -vv
6456
coverage xml -o coverage/python-coverage.xml
6557
58+
# Generate the coverage report
59+
grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "**/build.rs" --ignore "**/src/lib.rs" -o ./coverage.lcov
60+
grcov . --binary-path ./target/debug/ -s . -t cobertura --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "**/build.rs" --ignore "**/src/lib.rs" -o ./coverage/cobertura.xml
61+
6662
- name: Upload coverage to Codecov
6763
uses: codecov/codecov-action@v5
6864
with:

tests/python/test_bindings.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,36 @@ def apply(self, text: str):
108108
results = searcher.search("foo", 0.8)
109109

110110
assert results == ["foo"]
111+
112+
def test_word_ngram_edge_cases(self):
113+
# Empty string
114+
extractor = WordNgrams(n=2, splitter=" ", padder="#")
115+
features = extractor.apply("")
116+
# With n=2 and 1 padding on each side, we get ["# #"] -> ["# #1"]
117+
assert features == ["# #1"]
118+
119+
# String with only separators
120+
features_sep = extractor.apply(" ")
121+
assert features_sep == ["# #1"]
122+
123+
# Different splitter
124+
extractor_comma = WordNgrams(n=2, splitter=",", padder="#")
125+
features_comma = extractor_comma.apply("foo,bar")
126+
expected_comma = ["# foo1", "foo bar1", "bar #1"]
127+
assert Counter(features_comma) == Counter(expected_comma)
128+
129+
def test_word_ngrams_in_db(self):
130+
extractor = WordNgrams(n=2, splitter=" ", padder="#")
131+
db = HashDb(extractor)
132+
db.insert("foo bar")
133+
searcher = Searcher(db, Cosine())
134+
results = searcher.search("foo bar", 1.0)
135+
assert results == ["foo bar"]
136+
137+
def test_invalid_extractor_in_db(self):
138+
with pytest.raises(TypeError, match="Extractor must be CharacterNgrams, WordNgrams, or CustomExtractor"):
139+
HashDb("not an extractor")
140+
141+
def test_ranked_search_error_on_invalid_threshold(self):
142+
with pytest.raises(SearchError, match=r"Invalid threshold: 1\.1"):
143+
self.searcher.ranked_search("test", 1.1)

tests/python/test_errors.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pytest
2+
import multiprocessing
3+
import sys
4+
from simstring_rust.extractors import CustomExtractor
5+
from simstring_rust.database import HashDb
6+
7+
def run_crashing_extractor(): # pragma: no cover
8+
class CrashingExtractor:
9+
def apply(self, text):
10+
raise ValueError("Crash!")
11+
12+
extractor = CustomExtractor(CrashingExtractor())
13+
db = HashDb(extractor)
14+
# This should trigger a panic on the Rust side, because the exception raised in the callback is not handled
15+
db.insert("foo")
16+
17+
def test_custom_extractor_panic():
18+
# Run the crashing code in a separate process
19+
p = multiprocessing.Process(target=run_crashing_extractor)
20+
p.start()
21+
p.join()
22+
23+
# Check that the process exited with an error (a panic usually causes a non-zero exit code)
24+
assert p.exitcode != 0
25+
26+
def test_custom_extractor_missing_apply():
27+
class BadExtractor:
28+
pass
29+
30+
with pytest.raises(TypeError, match="Custom extractor must provide an apply"):
31+
CustomExtractor(BadExtractor())

tests/python/test_measures.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import pytest
2+
from simstring_rust.database import HashDb
3+
from simstring_rust.extractors import CharacterNgrams
4+
from simstring_rust.measures import Dice, Jaccard, Overlap, ExactMatch
5+
from simstring_rust.searcher import Searcher
6+
7+
class TestMeasures:
8+
def setup_method(self):
9+
self.extractor = CharacterNgrams(n=2, endmarker="$")
10+
self.db = HashDb(self.extractor)
11+
self.db.insert("foo")
12+
self.db.insert("bar")
13+
self.db.insert("fooo")
14+
15+
def test_dice(self):
16+
searcher = Searcher(self.db, Dice())
17+
results = searcher.ranked_search("foo", 0.8)
18+
# "foo" (4 features) vs "foo" (4 features) -> 2*4 / (4+4) = 1.0
19+
# "foo" vs "fooo" (5 features) -> intersection is 4 ($f, fo, oo, o$) -> 2*4 / (4+5) = 8/9 ~= 0.889
20+
assert len(results) == 2
21+
assert results[0][0] == "foo"
22+
assert results[0][1] == pytest.approx(1.0)
23+
assert results[1][0] == "fooo"
24+
assert results[1][1] == pytest.approx(0.88888888)
25+
26+
def test_jaccard(self):
27+
searcher = Searcher(self.db, Jaccard())
28+
results = searcher.ranked_search("foo", 0.8)
29+
# "foo" vs "foo" -> 1.0
30+
# "foo" vs "fooo" -> 4 / 5 = 0.8
31+
assert len(results) == 2
32+
assert results[0][0] == "foo"
33+
assert results[0][1] == pytest.approx(1.0)
34+
assert results[1][0] == "fooo"
35+
assert results[1][1] == pytest.approx(0.8)
36+
37+
def test_overlap(self):
38+
searcher = Searcher(self.db, Overlap())
39+
results = searcher.ranked_search("foo", 0.8)
40+
41+
assert len(results) == 2
42+
assert results[0][0] == "foo"
43+
assert results[0][1] == pytest.approx(1.0)
44+
assert results[1][0] == "fooo"
45+
assert results[1][1] == pytest.approx(1.0)
46+
47+
def test_exact_match(self):
48+
searcher = Searcher(self.db, ExactMatch())
49+
results = searcher.ranked_search("foo", 1.0)
50+
assert len(results) == 1
51+
assert results[0][0] == "foo"
52+
assert results[0][1] == pytest.approx(1.0)
53+
54+
results_partial = searcher.ranked_search("foo", 0.5)
55+
assert len(results_partial) == 1
56+
assert results_partial[0][0] == "foo"

tests/test_features.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,30 @@ mod word_ngrams_tests {
229229
);
230230
}
231231
}
232+
233+
#[test]
234+
fn test_character_ngrams_input_shorter_than_n() {
235+
// The condition `total_len < self.n` is only reachable if `text_len + 2*(n-1) < n`.
236+
// This simplifies to `text_len + n < 2`.
237+
// This is only possible if n=1 and text_len=0.
238+
// For any n >= 2, the padding ensures total_len >= n.
239+
240+
let extractor = CharacterNgrams::new(1, "$");
241+
let mut interner = Rodeo::default();
242+
243+
// "" -> len 0. n=1. padding=0. total_len=0. 0 < 1.
244+
let features = extractor.features("", &mut interner);
245+
assert!(
246+
features.is_empty(),
247+
"Features should be empty when input length is shorter than n (and n=1)"
248+
);
249+
}
250+
251+
#[test]
252+
fn test_word_ngrams_n_zero() {
253+
let extractor = WordNgrams::new(0, " ", "#");
254+
let mut interner = Rodeo::default();
255+
256+
let features = extractor.features("hello world", &mut interner);
257+
assert!(features.is_empty(), "Features should be empty when n=0");
258+
}

0 commit comments

Comments
 (0)