Skip to content

Commit 083eb2f

Browse files
authored
Initial API structure (#1)
* WIP: Initial API
1 parent 32d6a3b commit 083eb2f

File tree

22 files changed

+1424
-1
lines changed

22 files changed

+1424
-1
lines changed

.github/workflows/CI.yml

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
name: Rust CI/CD
2+
3+
on:
4+
push:
5+
branches: ["main"]
6+
pull_request:
7+
branches: ["main"]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
RUST_BACKTRACE: 1
12+
13+
jobs:
14+
test:
15+
name: Test on ${{ matrix.os }} / ${{ matrix.rust }}
16+
runs-on: ${{ matrix.os }}
17+
strategy:
18+
fail-fast: false
19+
matrix:
20+
os: [ubuntu-latest, windows-latest, macos-latest]
21+
rust: [stable]
22+
include:
23+
- os: ubuntu-latest
24+
rust: stable
25+
target: x86_64-unknown-linux-gnu
26+
- os: ubuntu-latest
27+
rust: stable
28+
target: aarch64-unknown-linux-gnu
29+
- os: windows-latest
30+
rust: stable
31+
target: x86_64-pc-windows-msvc
32+
- os: macos-latest
33+
rust: stable
34+
target: x86_64-apple-darwin
35+
- os: macos-latest
36+
rust: stable
37+
target: aarch64-apple-darwin
38+
39+
steps:
40+
- uses: actions/checkout@v3
41+
42+
- name: Install Rust toolchain
43+
uses: dtolnay/rust-toolchain@stable
44+
with:
45+
toolchain: ${{ matrix.rust }}
46+
target: ${{ matrix.target }}
47+
components: rustfmt, clippy
48+
49+
- name: Install cross
50+
if: matrix.target == 'aarch64-unknown-linux-gnu'
51+
run: cargo install cross
52+
53+
- name: Set up cargo cache
54+
uses: actions/cache@v3
55+
with:
56+
path: |
57+
~/.cargo/bin/
58+
~/.cargo/registry/index/
59+
~/.cargo/registry/cache/
60+
~/.cargo/git/db/
61+
target/
62+
key: ${{ runner.os }}-${{ matrix.target }}-cargo-${{ hashFiles('**/Cargo.lock') }}
63+
restore-keys: ${{ runner.os }}-${{ matrix.target }}-cargo-
64+
65+
- name: Check formatting
66+
run: cargo fmt --all -- --check
67+
68+
- name: Run clippy
69+
if: matrix.target != 'aarch64-unknown-linux-gnu'
70+
run: cargo clippy --target ${{ matrix.target }} -- -D warnings
71+
72+
- name: Run clippy (cross)
73+
if: matrix.target == 'aarch64-unknown-linux-gnu'
74+
run: cross clippy --target ${{ matrix.target }} -- -D warnings
75+
76+
- name: Run tests
77+
if: matrix.target != 'aarch64-unknown-linux-gnu'
78+
run: cargo test --target ${{ matrix.target }} --verbose
79+
80+
- name: Run tests (cross)
81+
if: matrix.target == 'aarch64-unknown-linux-gnu'
82+
run: cross test --target ${{ matrix.target }} --verbose
83+
84+
- name: Build
85+
if: matrix.target != 'aarch64-unknown-linux-gnu'
86+
run: cargo build --target ${{ matrix.target }} --verbose
87+
88+
- name: Build (cross)
89+
if: matrix.target == 'aarch64-unknown-linux-gnu'
90+
run: cross build --target ${{ matrix.target }} --verbose
91+
92+
# Documentation check
93+
- name: Check documentation
94+
run: cargo doc --no-deps --document-private-items
95+
96+
# Publish to crates.io on new tags
97+
publish:
98+
name: Publish to crates.io
99+
needs: test
100+
if: startsWith(github.ref, 'refs/tags/v')
101+
runs-on: ubuntu-latest
102+
steps:
103+
- uses: actions/checkout@v3
104+
105+
- name: Install Rust toolchain
106+
uses: dtolnay/rust-toolchain@stable
107+
108+
- name: Verify tag version matches Cargo.toml version
109+
run: |
110+
CARGO_VERSION=$(cargo pkgid | cut -d# -f2)
111+
TAG_VERSION=${GITHUB_REF#refs/tags/v}
112+
if [ "$CARGO_VERSION" != "$TAG_VERSION" ]; then
113+
echo "Error: Git tag $TAG_VERSION doesn't match Cargo.toml version $CARGO_VERSION"
114+
exit 1
115+
fi
116+
117+
- name: Check package
118+
run: cargo package
119+
120+
- name: Publish to crates.io
121+
run: cargo publish
122+
env:
123+
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
/target
2+
Session.vim

Cargo.toml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,19 @@
11
[package]
22
name = "simstring_rs"
33
version = "0.1.0"
4+
description = "A Rust implementation of the SimString algorithm"
5+
license = "MIT"
6+
repository = "https://github.com/PyDataBlog/simstring_rs"
7+
documentation = "https://docs.rs/simstring_rs"
48
edition = "2021"
9+
authors = ["Bernard Brenyah <bbrenyah@gmail.com>"]
10+
keywords = ["string", "matching", "nlp", "algorithm", "simstring", "cpmerge"]
11+
categories = ["text-processing", "nlp"]
12+
homepage = "https://github.com/PyDataBlog/simstring_rs#readme"
13+
readme = "README.md"
514

6-
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
15+
[lib]
16+
name = "simstring_rs"
17+
path = "src/lib.rs"
718

819
[dependencies]

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2021 Bernard Brenyah
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# simstring_rs
2+
3+
A native Rust implementation of the CPMerge algorithm, designed for approximate string matching. This crate is particularly useful for natural language processing tasks that require retrieving strings/texts from very large corpora (large amounts of text). Currently, this crate supports both character-based and word-based n-gram feature generation, with plans to allow custom user-defined feature generation methods.
4+
5+
## Features
6+
7+
- ✅ Fast algorithm for string matching
8+
- ✅ 100% exact retrieval
9+
- ✅ Support for Unicode
10+
- [ ] Support for building databases directly from text files
11+
- [ ] Mecab-based tokenizer support
12+
- [ ] Support for persistent databases like MongoDB
13+
14+
## Supported String Similarity Measures
15+
16+
- ✅ Dice coefficient
17+
- ✅ Jaccard coefficient
18+
- ✅ Cosine coefficient
19+
- ✅ Overlap coefficient
20+
- ✅ Exact match
21+
22+
## Installation
23+
24+
Add `simstring_rs` to your `Cargo.toml`:
25+
26+
```toml
27+
[dependencies]
28+
simstring_rs = "0.1.0"
29+
```
30+
31+
For the latest features, you can depend on the `main` branch by specifying the Git repository:
32+
33+
```toml
34+
[dependencies]
35+
simstring_rs = { git = "https://github.com/PyDataBlog/simstring_rs.git", branch = "main" }
36+
```
37+
38+
Note: Using the `main` branch may include experimental features and potential breakages. Use with caution!
39+
40+
To revert to a stable version, ensure your Cargo.toml specifies a specific version number instead of the Git repository.
41+
42+
## Usage
43+
44+
Here is a basic example of how to use simstring_rs in your Rust project:
45+
46+
```Rust
47+
48+
```
49+
50+
## Contributing
51+
52+
Contributions are welcome! Please open an issue or submit a pull request on GitHub.
53+
## License
54+
55+
This project is licensed under the MIT License.
56+
57+
## Acknowledgements
58+
59+
Inspired by the [SimString.jl](https://github.com/PyDataBlog/SimString.jl) project.

examples/basic_usage.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
use simstring_rs::database::{HashDB, SimStringDB};
2+
use simstring_rs::extractors::{CharacterNGrams, FeatureExtractor};
3+
use simstring_rs::measures::Cosine;
4+
5+
fn main() {
6+
let _cs = Cosine::new();
7+
8+
let feature_extractor = CharacterNGrams {
9+
n: 3,
10+
padder: " ".to_string(),
11+
};
12+
13+
let mut db = HashDB::new(feature_extractor);
14+
15+
db.insert("hello".to_string());
16+
db.insert("help".to_string());
17+
db.insert("halo".to_string());
18+
db.insert("world".to_string());
19+
20+
let (total_collection, avg_size_ngrams, total_ngrams) = db.describe_collection();
21+
println!(
22+
"Database contains {} strings, average n-gram size {:.2}, total n-grams {}.",
23+
total_collection, avg_size_ngrams, total_ngrams
24+
);
25+
26+
//println!("Complete DB State: {:?}", db); # FIX: db needs a fmt.debug implementation
27+
28+
let query = "prepress";
29+
30+
let query_features = db.feature_extractor.extract(query);
31+
let query_size = query_features.len();
32+
33+
println!("Query size: {}", query_size);
34+
35+
println!("Extracted features from query '{}':", query);
36+
for (feature, count) in &query_features {
37+
println!(" - Feature: '{}', Count: {}", feature, count);
38+
}
39+
}

src/database/hashdb.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
use crate::{FeatureExtractor, SimStringDB};
2+
use std::collections::{HashMap, HashSet};
3+
4+
pub struct HashDB<TExtractor>
5+
where
6+
TExtractor: FeatureExtractor,
7+
{
8+
pub feature_extractor: TExtractor,
9+
pub string_collection: Vec<String>,
10+
pub string_size_map: HashMap<usize, HashSet<String>>,
11+
pub string_feature_map: HashMap<usize, HashMap<(String, i32), HashSet<String>>>,
12+
pub lookup_cache: HashMap<(usize, (String, i32)), HashSet<String>>,
13+
}
14+
15+
impl<TExtractor> HashDB<TExtractor>
16+
where
17+
TExtractor: FeatureExtractor,
18+
{
19+
pub fn new(feature_extractor: TExtractor) -> Self {
20+
HashDB {
21+
feature_extractor,
22+
string_collection: Vec::new(),
23+
string_size_map: HashMap::new(),
24+
string_feature_map: HashMap::new(),
25+
lookup_cache: HashMap::new(),
26+
}
27+
}
28+
29+
pub fn lookup_feature_set_by_size_feature(
30+
&mut self,
31+
size: usize,
32+
feature: &(String, i32),
33+
) -> &HashSet<String> {
34+
let cache_key = (size, feature.clone());
35+
36+
self.lookup_cache
37+
.entry(cache_key.clone())
38+
.or_insert_with(|| {
39+
// If not in cache, retrieve from string_feature_map or return an empty set
40+
self.string_feature_map
41+
.get(&size)
42+
.and_then(|feature_map| feature_map.get(feature))
43+
.cloned()
44+
.unwrap_or_else(HashSet::new)
45+
})
46+
}
47+
}
48+
49+
impl<TExtractor> SimStringDB for HashDB<TExtractor>
50+
where
51+
TExtractor: FeatureExtractor,
52+
{
53+
fn get_max_feature_size(&self) -> usize {
54+
*self.string_feature_map.keys().max().unwrap_or(&0)
55+
}
56+
57+
fn insert(&mut self, s: String) {
58+
// Add the string to the collection
59+
self.string_collection.push(s.clone());
60+
61+
// Extract features from the string
62+
let features = self.feature_extractor.extract(&s);
63+
64+
// Determine the size (number of features)
65+
let size = features.len();
66+
67+
// Update string_size_map
68+
self.string_size_map
69+
.entry(size)
70+
.or_default()
71+
.insert(s.clone());
72+
73+
// Update string_feature_map
74+
let feature_map = self.string_feature_map.entry(size).or_default();
75+
76+
for (feature, count) in features {
77+
let key = (feature.clone(), count);
78+
79+
feature_map.entry(key).or_default().insert(s.clone());
80+
}
81+
}
82+
83+
fn describe_collection(&self) -> (usize, f64, usize) {
84+
let total_collection = self.string_collection.len();
85+
86+
let total_sizes: usize = self
87+
.string_size_map
88+
.iter()
89+
.map(|(size, strings)| size * strings.len())
90+
.sum();
91+
let total_strings: usize = self
92+
.string_size_map
93+
.values()
94+
.map(|strings| strings.len())
95+
.sum();
96+
let avg_size_ngrams = if total_strings == 0 {
97+
0.0
98+
} else {
99+
total_sizes as f64 / total_strings as f64
100+
};
101+
102+
let total_ngrams: usize = self
103+
.string_feature_map
104+
.values()
105+
.map(|feature_map| feature_map.len())
106+
.sum();
107+
108+
(total_collection, avg_size_ngrams, total_ngrams)
109+
}
110+
}

src/database/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
mod hashdb;
2+
3+
/// Common interface of a SimString database: strings can be inserted and the
/// stored collection can be summarised.
pub trait SimStringDB {
4+
/// Adds the string `s` to the database.
fn insert(&mut self, s: String);
5+
/// Returns a three-part summary of the stored collection. The `HashDB`
/// implementation reports, in order: the number of inserted strings, the
/// average number of features per stored string, and the number of indexed
/// features.
fn describe_collection(&self) -> (usize, f64, usize);
6+
/// Returns the largest feature-set size known to the database; the `HashDB`
/// implementation returns `0` when nothing has been indexed.
fn get_max_feature_size(&self) -> usize;
7+
}
8+
9+
pub use hashdb::HashDB;

0 commit comments

Comments
 (0)