fix(tests): Add python bindings to the test coverage report (#58)

PyDataBlog · web-flow · commit 248b13c452d8 · 2026-01-27T12:38:15.000+01:00
* feat(test): Improved test coverage for edge cases and errors

* feat(test): More test coverage for measures

* feat(ci): added dependabot config
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,38 +7,20 @@ assignees: ""
 
 ---
 
-**Description**
+## Description
 
 <!-- Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. -->
 
-**Fixes** (issue)
+## Fixes (issue)
 
-**Type of change**
-
-<!-- Please update the title of your PR to match the type of change. The title will be used for the commit message and the changelog. -->
-<!---->
-<!-- The `(scope)` is optional and refers to the part of the codebase you're changing (e.g., `feat(search)`, `fix(ci)`). -->
-
-<!-- - [ ] `fix`: A bug fix -->
-<!-- - [ ] `feat`: A new feature -->
-<!-- - [ ] `feat!`: A breaking change -->
-<!-- - [ ] `docs`: Documentation only changes -->
-<!-- - [ ] `chore`: Changes to the build process or auxiliary tools -->
-<!-- - [ ] `refactor`: A code change that neither fixes a bug nor adds a feature -->
-<!-- - [ ] `perf`: A code change that improves performance -->
-<!-- - [ ] `test`: Adding missing tests or correcting existing tests -->
-<!-- - [ ] `style`: Changes that do not affect the meaning of the code -->
-<!-- - [ ] `ci`: Changes to our CI configuration files and scripts -->
-<!-- - [ ] `revert`: Reverts a previous commit -->
-
-**How Has This Been Tested?**
+## How Has This Been Tested?
 
 <!-- Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. -->
 
 - [ ] Test A
 - [ ] Test B
 
-**Checklist:**
+## Checklist
 
 - [ ] My code follows the style guidelines of this project
 - [ ] I have performed a self-review of my own code
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,34 @@
+version: 2
+updates:
+  # Rust dependencies (Cargo)
+  - package-ecosystem: "cargo"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+    labels:
+      - "dependencies"
+      - "rust"
+
+  # Python dependencies (pip/pyproject.toml)
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+    labels:
+      - "dependencies"
+      - "python"
+
+  # GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+    labels:
+      - "dependencies"
+      - "ci"
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
@@ -21,11 +21,6 @@ jobs:
             target/
           key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
 
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y software-properties-common python3-dev
-
       - name: Set up Python
         uses: actions/setup-python@v6
         with:
@@ -48,21 +43,22 @@ jobs:
           # Set the path for the raw coverage data
           export LLVM_PROFILE_FILE="target/coverage/simstring_rs-%p-%m.profraw"
-          # Run all tests, including the ignored python bindings
+          # Run the Rust test suite; the Python bindings are covered by pytest below
-          cargo test --all-features -- --include-ignored
-          # Generate the coverage report
-          grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "build.rs" --ignore "src/lib.rs" -o ./coverage.lcov
-          grcov . --binary-path ./target/debug/ -s . -t cobertura --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "build.rs" --ignore "src/lib.rs" -o ./coverage/cobertura.xml
+          cargo test --all-features
           # Generate Python coverage report
           pip install uv
           uv venv
           source .venv/bin/activate
           uv pip install maturin pytest coverage
           rm -rf target/wheels
-          maturin build --release
-          uv pip install target/wheels/*.whl
+          # Build and install in editable mode for coverage mapping
+          maturin develop
           coverage run -m pytest tests/python/ -vv
           coverage xml -o coverage/python-coverage.xml
 
+          # Generate the coverage report
+          grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "**/build.rs" --ignore "**/src/lib.rs" -o ./coverage.lcov
+          grcov . --binary-path ./target/debug/ -s . -t cobertura --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "**/build.rs" --ignore "**/src/lib.rs" -o ./coverage/cobertura.xml
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
diff --git a/tests/python/test_bindings.py b/tests/python/test_bindings.py
@@ -108,3 +108,36 @@ def apply(self, text: str):
         results = searcher.search("foo", 0.8)
 
         assert results == ["foo"]
+
+    def test_word_ngram_edge_cases(self):
+        # Empty string
+        extractor = WordNgrams(n=2, splitter=" ", padder="#")
+        features = extractor.apply("")
+        # With n=2 and 1 padding on each side, we get ["# #"] -> ["# #1"]
+        assert features == ["# #1"]
+
+        # String with only separators
+        features_sep = extractor.apply("   ")
+        assert features_sep == ["# #1"]
+
+        # Different splitter
+        extractor_comma = WordNgrams(n=2, splitter=",", padder="#")
+        features_comma = extractor_comma.apply("foo,bar")
+        expected_comma = ["# foo1", "foo bar1", "bar #1"]
+        assert Counter(features_comma) == Counter(expected_comma)
+
+    def test_word_ngrams_in_db(self):
+        extractor = WordNgrams(n=2, splitter=" ", padder="#")
+        db = HashDb(extractor)
+        db.insert("foo bar")
+        searcher = Searcher(db, Cosine())
+        results = searcher.search("foo bar", 1.0)
+        assert results == ["foo bar"]
+
+    def test_invalid_extractor_in_db(self):
+        with pytest.raises(TypeError, match="Extractor must be CharacterNgrams, WordNgrams, or CustomExtractor"):
+            HashDb("not an extractor")
+
+    def test_ranked_search_error_on_invalid_threshold(self):
+        with pytest.raises(SearchError, match=r"Invalid threshold: 1\.1"):
+            self.searcher.ranked_search("test", 1.1)
diff --git a/tests/python/test_errors.py b/tests/python/test_errors.py
@@ -0,0 +1,31 @@
+import pytest
+import multiprocessing
+import sys
+from simstring_rust.extractors import CustomExtractor
+from simstring_rust.database import HashDb
+
+def run_crashing_extractor():  # pragma: no cover
+    class CrashingExtractor:
+        def apply(self, text):
+            raise ValueError("Crash!")
+
+    extractor = CustomExtractor(CrashingExtractor())
+    db = HashDb(extractor)
+    # This should panic the Rust side because of the unhandled exception in the callback
+    db.insert("foo")
+
+def test_custom_extractor_panic():
+    # Isolate the crash in a child process; daemon=True so a stuck child cannot outlive pytest.
+    p = multiprocessing.Process(target=run_crashing_extractor, daemon=True)
+    p.start()
+    p.join(timeout=60)  # bounded wait: a hung child must fail the test, not stall the run
+
+    # exitcode stays None if the child never finished; a crash yields a non-zero code.
+    assert p.exitcode not in (0, None)
+
+def test_custom_extractor_missing_apply():
+    class BadExtractor:
+        pass
+
+    with pytest.raises(TypeError, match="Custom extractor must provide an apply"):
+        CustomExtractor(BadExtractor())
diff --git a/tests/python/test_measures.py b/tests/python/test_measures.py
@@ -0,0 +1,56 @@
+import pytest
+from simstring_rust.database import HashDb
+from simstring_rust.extractors import CharacterNgrams
+from simstring_rust.measures import Dice, Jaccard, Overlap, ExactMatch
+from simstring_rust.searcher import Searcher
+
+class TestMeasures:
+    def setup_method(self):
+        self.extractor = CharacterNgrams(n=2, endmarker="$")
+        self.db = HashDb(self.extractor)
+        self.db.insert("foo")
+        self.db.insert("bar")
+        self.db.insert("fooo")
+
+    def test_dice(self):
+        searcher = Searcher(self.db, Dice())
+        results = searcher.ranked_search("foo", 0.8)
+        # Dice = 2*|X & Y| / (|X| + |Y|): "foo" vs "foo" (4 features each) -> 2*4 / (4+4) = 1.0
+        # "foo" vs "fooo" (5 features) -> intersect is 4 ($f, fo, oo, o$) -> 2*4 / (4+5) = 8/9 ~= 0.889
+        assert len(results) == 2
+        assert results[0][0] == "foo"
+        assert results[0][1] == pytest.approx(1.0)
+        assert results[1][0] == "fooo"
+        assert results[1][1] == pytest.approx(0.88888888)
+
+    def test_jaccard(self):
+        searcher = Searcher(self.db, Jaccard())
+        results = searcher.ranked_search("foo", 0.8)
+        # "foo" vs "foo" -> 1.0
+        # "foo" vs "fooo" -> 4 / 5 = 0.8
+        assert len(results) == 2
+        assert results[0][0] == "foo"
+        assert results[0][1] == pytest.approx(1.0)
+        assert results[1][0] == "fooo"
+        assert results[1][1] == pytest.approx(0.8)
+
+    def test_overlap(self):
+        searcher = Searcher(self.db, Overlap())
+        results = searcher.ranked_search("foo", 0.8)
+        # Expected: "foo" and "fooo" both contain all 4 query features ($f, fo, oo, o$),
+        # so each scores 1.0 (presumably 4 / min(|X|, |Y|) -- confirm against Overlap).
+        # The two hits tie, so do not rely on their relative order.
+        assert len(results) == 2
+        assert sorted(r[0] for r in results) == ["foo", "fooo"]
+        assert all(r[1] == pytest.approx(1.0) for r in results)
+
+    def test_exact_match(self):
+        searcher = Searcher(self.db, ExactMatch())
+        results = searcher.ranked_search("foo", 1.0)
+        assert len(results) == 1
+        assert results[0][0] == "foo"
+        assert results[0][1] == pytest.approx(1.0)
+
+        results_partial = searcher.ranked_search("foo", 0.5)
+        assert len(results_partial) == 1
+        assert results_partial[0][0] == "foo"
diff --git a/tests/test_features.rs b/tests/test_features.rs
@@ -229,3 +229,30 @@ mod word_ngrams_tests {
         );
     }
 }
+
+#[test]
+fn test_character_ngrams_input_shorter_than_n() {
+    // The condition `total_len < self.n` is only reachable if `text_len + 2*(n-1) < n`.
+    // This simplifies to `text_len + n < 2`.
+    // This is only possible if n=1 and text_len=0.
+    // For any n >= 2, the padding ensures total_len >= n.
+
+    let extractor = CharacterNgrams::new(1, "$");
+    let mut interner = Rodeo::default();
+
+    // "" -> len 0. n=1. padding=0. total_len=0. 0 < 1.
+    let features = extractor.features("", &mut interner);
+    assert!(
+        features.is_empty(),
+        "Features should be empty when input length is shorter than n (and n=1)"
+    );
+}
+
+#[test]
+fn test_word_ngrams_n_zero() {
+    let extractor = WordNgrams::new(0, " ", "#");
+    let mut interner = Rodeo::default();
+
+    let features = extractor.features("hello world", &mut interner);
+    assert!(features.is_empty(), "Features should be empty when n=0");
+}
diff --git a/tests/test_measures.rs b/tests/test_measures.rs
diff --git a/tests/test_search.rs b/tests/test_search.rs