Skip to content

Commit d53e85b

Browse files
authored
feat(perf): better search performance by avoiding allocations (#40)
* perf(search): Initial optimization round which improves searches
* feat(test): Switched to grcov for code coverage
1 parent 5e3efe2 commit d53e85b

File tree

24 files changed

+337
-135
lines changed

24 files changed

+337
-135
lines changed

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
blank_issues_enabled: false
22
contact_links:
33
- name: Ask a question
4-
url: https://github.com/bebr/simstring_rs/discussions/new
4+
url: https://github.com/PyDataBlog/simstring_rs/discussions/new
55
about: Please ask and answer questions here.

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ about: Propose a change to the project
44
title: "feat(scope): describe your change"
55
labels: ""
66
assignees: ""
7+
78
---
89

910
**Description**

.github/workflows/coverage.yml

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,27 @@ jobs:
3131
with:
3232
python-version: "3.13"
3333

34-
- name: Install cargo-tarpaulin
35-
run: cargo install cargo-tarpaulin --version 0.26.0 --force
36-
37-
- name: Set Python interpreter for pyo3
38-
run: echo "PYTHON_SYS_EXECUTABLE=$(which python3)" >> $GITHUB_ENV
34+
- name: Install grcov and llvm-tools
35+
run: |
36+
cargo install grcov --force
37+
rustup component add llvm-tools
3938
4039
- name: Run tests and generate coverage report
40+
env:
41+
SIMSTRING_RS_COVERAGE: "1"
4142
run: |
42-
LIBDIR=$(python3 -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")
43-
VERSION=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
44-
export RUSTFLAGS="-L $LIBDIR -lpython$VERSION"
45-
cargo tarpaulin --all-features --out Xml --output-dir ./coverage
43+
# Create directory for coverage reports
44+
mkdir -p ./coverage
45+
# Set flags for coverage generation
46+
export CARGO_INCREMENTAL=0
47+
export RUSTFLAGS="-Cinstrument-coverage"
48+
# Set the path for the raw coverage data
49+
export LLVM_PROFILE_FILE="target/coverage/simstring_rs-%p-%m.profraw"
50+
# Run all tests, including the ignored python bindings
51+
cargo test --all-features -- --include-ignored
52+
# Generate the coverage report
53+
grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "build.rs" -o ./coverage.lcov
54+
grcov . --binary-path ./target/debug/ -s . -t cobertura --branch --ignore-not-existing --ignore "tests/*" --ignore "examples/*" --ignore "build.rs" -o ./coverage/cobertura.xml
4655
4756
- name: Upload coverage to Codecov
4857
uses: codecov/codecov-action@v5

Cargo.lock

Lines changed: 9 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ path = "src/lib.rs"
1818
crate-type = ["cdylib", "rlib"]
1919

2020
[dependencies]
21-
rayon = "1.10"
2221
ahash = "0.8"
22+
rayon = "1.10"
23+
thiserror = "2.0"
2324
lasso = "0.7"
24-
thiserror = "2"
25+
rustc-hash = "2.1"
2526
pyo3 = { version = "0.25", features = ["extension-module", "abi3-py37"] }
2627
serde = { version = "1.0", features = ["derive"] }
2728
serde_json = "1.0"

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
[![Build Status](https://github.com/PyDataBlog/simstring_rs/actions/workflows/CI.yml/badge.svg)](https://github.com/PyDataBlog/simstring_rs/actions)
44
[![Crates.io](https://img.shields.io/crates/v/simstring_rust.svg)](https://crates.io/crates/simstring_rust)
5+
[![PyPI version](https://badge.fury.io/py/simstring-rust.svg)](https://badge.fury.io/py/simstring-rust)
6+
[![Python versions](https://img.shields.io/pypi/pyversions/simstring-rust.svg)](https://pypi.org/project/simstring-rust)
57
[![Documentation](https://docs.rs/simstring_rust/badge.svg)](https://docs.rs/simstring_rust)
68
[![Rust](https://img.shields.io/badge/rust-1.63.0%2B-blue.svg?maxAge=3600)](https://github.com/PyDataBlog/simstring_rs)
79
[![Codecov](https://img.shields.io/codecov/c/github/PyDataBlog/simstring_rs?token=XJM8O8TD4U)](https://codecov.io/gh/PyDataBlog/simstring_rs)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ classifiers = [
2020
"Programming Language :: Python :: 3.10",
2121
"Programming Language :: Python :: 3.11",
2222
"Programming Language :: Python :: 3.12",
23+
"Programming Language :: Python :: 3.13",
2324
"License :: OSI Approved :: MIT License",
2425
"Operating System :: POSIX",
2526
"Operating System :: MacOS :: MacOS X",

src/database/hashdb.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
use crate::database::{Database, StringId};
22
use crate::extractors::FeatureExtractor;
3-
use ahash::{AHashMap, AHashSet};
43
use lasso::{Rodeo, Spur};
4+
use rustc_hash::{FxHashMap, FxHashSet};
55
use std::fmt;
66
use std::sync::{Arc, Mutex};
77

88
pub struct HashDb {
99
feature_extractor: Arc<dyn FeatureExtractor>,
1010
pub strings: Vec<String>,
1111
string_features: Vec<Vec<Spur>>,
12-
feature_map: AHashMap<usize, AHashMap<Spur, AHashSet<StringId>>>,
12+
feature_map: FxHashMap<usize, FxHashMap<Spur, FxHashSet<StringId>>>,
1313
interner: Arc<Mutex<Rodeo>>,
1414
}
1515

@@ -37,7 +37,7 @@ impl HashDb {
3737
feature_extractor,
3838
strings: Vec::new(),
3939
string_features: Vec::new(),
40-
feature_map: AHashMap::default(),
40+
feature_map: FxHashMap::default(),
4141
interner: Arc::new(Mutex::new(Rodeo::default())),
4242
}
4343
}
@@ -75,7 +75,7 @@ impl Database for HashDb {
7575
self.interner.lock().unwrap().clear();
7676
}
7777

78-
fn lookup_strings(&self, size: usize, feature: Spur) -> Option<&AHashSet<StringId>> {
78+
fn lookup_strings(&self, size: usize, feature: Spur) -> Option<&FxHashSet<StringId>> {
7979
self.feature_map.get(&size)?.get(&feature)
8080
}
8181

@@ -98,4 +98,8 @@ impl Database for HashDb {
9898
fn interner(&self) -> Arc<Mutex<Rodeo>> {
9999
Arc::clone(&self.interner)
100100
}
101+
102+
fn total_strings(&self) -> usize {
103+
self.strings.len()
104+
}
101105
}

src/database/mod.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
11
mod hashdb;
22

33
use crate::extractors::FeatureExtractor;
4-
use ahash::AHashSet;
54
use lasso::{Rodeo, Spur};
5+
use rustc_hash::FxHashSet;
66
use std::sync::{Arc, Mutex};
77

88
pub type StringId = usize;
99

1010
pub trait Database: Send + Sync {
1111
fn insert(&mut self, text: String);
1212
fn clear(&mut self);
13-
fn lookup_strings(&self, size: usize, feature: Spur) -> Option<&AHashSet<StringId>>;
13+
fn lookup_strings(&self, size: usize, feature: Spur) -> Option<&FxHashSet<StringId>>;
1414
fn get_string(&self, id: StringId) -> Option<&str>;
1515
fn get_features(&self, id: StringId) -> Option<&Vec<Spur>>;
1616
fn feature_extractor(&self) -> &dyn FeatureExtractor;
1717
fn max_feature_len(&self) -> usize;
1818
fn interner(&self) -> Arc<Mutex<Rodeo>>;
19+
fn total_strings(&self) -> usize;
1920
}
2021

2122
pub use hashdb::HashDb;

src/extractors/character_ngrams.rs

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,36 @@ impl FeatureExtractor for CharacterNgrams {
2727
if self.n == 0 {
2828
return vec![];
2929
}
30-
let padding = self.endmarker.repeat(self.n.saturating_sub(1));
31-
let padded_text = format!("{padding}{text}{padding}");
32-
33-
let ngrams: Vec<String> = padded_text
34-
.chars()
35-
.collect::<Vec<char>>()
36-
.windows(self.n)
37-
.map(|window| window.iter().collect())
38-
.collect();
30+
31+
// Pre-calculate capacity to avoid reallocations
32+
let text_len = text.chars().count();
33+
let padding_len = self.n.saturating_sub(1);
34+
let total_len = text_len + 2 * padding_len;
35+
36+
if total_len < self.n {
37+
return vec![];
38+
}
39+
40+
let expected_ngrams = total_len - self.n + 1;
41+
let mut ngrams = Vec::with_capacity(expected_ngrams);
42+
43+
let padding = self.endmarker.repeat(padding_len);
44+
45+
// collect chars once, then slice
46+
let mut all_chars = Vec::with_capacity(total_len);
47+
all_chars.extend(padding.chars());
48+
all_chars.extend(text.chars());
49+
all_chars.extend(padding.chars());
50+
51+
// Generate n-grams using efficient windowing
52+
for window in all_chars.windows(self.n) {
53+
// Pre-allocate string with known capacity
54+
let mut ngram = String::with_capacity(self.n * 4); // Assume max 4 bytes per char
55+
for &ch in window {
56+
ngram.push(ch);
57+
}
58+
ngrams.push(ngram);
59+
}
3960

4061
super::append_feature_counts(interner, ngrams)
4162
}

0 commit comments

Comments
 (0)