PyDataBlog · PyDataBlog · Dec 5, 2024 · Jul 3, 2024 · Jul 8, 2024 · Jul 20, 2024
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -0,0 +1,123 @@
+name: Rust CI/CD
+
+# Run on pushes and pull requests that target main, and on version tags.
+# The tag filter is what lets the `publish` job run: that job is gated on
+# `github.ref` starting with `refs/tags/v`, and a tag push does not trigger
+# a workflow whose `push` filter only lists branches.
+on:
+  push:
+    branches: ["main"]
+    tags: ["v*"]
+  pull_request:
+    branches: ["main"]
+
+env:
+  # Force coloured cargo output in the job logs.
+  CARGO_TERM_COLOR: always
+  # Print a backtrace when a Rust binary or test panics.
+  RUST_BACKTRACE: 1
+
+jobs:
+  # Format-check, lint, test, build and doc-check the crate per target triple.
+  test:
+    # The target is part of the name because two entries share the same OS.
+    name: Test on ${{ matrix.os }} / ${{ matrix.rust }} / ${{ matrix.target }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # Every job is listed explicitly. Declaring base `os`/`rust` axes and
+        # then repeating those values in `include` merges each entry into the
+        # same base combination, where a later `target` overwrites an earlier
+        # one, so only one target per OS would actually run.
+        include:
+          - os: ubuntu-latest
+            rust: stable
+            target: x86_64-unknown-linux-gnu
+          - os: ubuntu-latest
+            rust: stable
+            target: aarch64-unknown-linux-gnu
+          - os: windows-latest
+            rust: stable
+            target: x86_64-pc-windows-msvc
+          - os: macos-latest
+            rust: stable
+            target: x86_64-apple-darwin
+          - os: macos-latest
+            rust: stable
+            target: aarch64-apple-darwin
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          toolchain: ${{ matrix.rust }}
+          target: ${{ matrix.target }}
+          components: rustfmt, clippy
+
+      # The aarch64 Linux target is driven through `cross` instead of cargo.
+      - name: Install cross
+        if: matrix.target == 'aarch64-unknown-linux-gnu'
+        run: cargo install cross
+
+      - name: Set up cargo cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/bin/
+            ~/.cargo/registry/index/
+            ~/.cargo/registry/cache/
+            ~/.cargo/git/db/
+            target/
+          # Keyed per OS and target so jobs never share build artefacts.
+          key: ${{ runner.os }}-${{ matrix.target }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: ${{ runner.os }}-${{ matrix.target }}-cargo-
+
+      - name: Check formatting
+        run: cargo fmt --all -- --check
+
+      - name: Run clippy
+        if: matrix.target != 'aarch64-unknown-linux-gnu'
+        run: cargo clippy --target ${{ matrix.target }} -- -D warnings
+
+      - name: Run clippy (cross)
+        if: matrix.target == 'aarch64-unknown-linux-gnu'
+        run: cross clippy --target ${{ matrix.target }} -- -D warnings
+
+      - name: Run tests
+        if: matrix.target != 'aarch64-unknown-linux-gnu'
+        run: cargo test --target ${{ matrix.target }} --verbose
+
+      - name: Run tests (cross)
+        if: matrix.target == 'aarch64-unknown-linux-gnu'
+        run: cross test --target ${{ matrix.target }} --verbose
+
+      - name: Build
+        if: matrix.target != 'aarch64-unknown-linux-gnu'
+        run: cargo build --target ${{ matrix.target }} --verbose
+
+      - name: Build (cross)
+        if: matrix.target == 'aarch64-unknown-linux-gnu'
+        run: cross build --target ${{ matrix.target }} --verbose
+
+      # Documentation check
+      - name: Check documentation
+        run: cargo doc --no-deps --document-private-items
+
+  # Publish to crates.io on new tags.
+  # The job is skipped unless the run was started for a ref under
+  # `refs/tags/v`, and it only starts after every `test` job has succeeded.
+  publish:
+    name: Publish to crates.io
+    needs: test
+    if: startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Verify tag version matches Cargo.toml version
+        run: |
+          # Read the version from `cargo metadata` rather than cutting the
+          # output of `cargo pkgid`: the pkgid fragment is `#<version>` only
+          # when the checkout directory is named after the package, and
+          # `#<name>@<version>` otherwise.
+          CARGO_VERSION=$(cargo metadata --no-deps --format-version 1 | jq -r '.packages[0].version')
+          # Strip the `refs/tags/v` prefix, leaving e.g. `0.1.0`.
+          TAG_VERSION=${GITHUB_REF#refs/tags/v}
+          if [ "$CARGO_VERSION" != "$TAG_VERSION" ]; then
+            echo "Error: Git tag $TAG_VERSION doesn't match Cargo.toml version $CARGO_VERSION"
+            exit 1
+          fi
+
+      - name: Check package
+        run: cargo package
+
+      - name: Publish to crates.io
+        run: cargo publish
+        env:
+          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 /target
+Session.vim
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,8 +1,19 @@
 [package]
 name = "simstring_rs"
 version = "0.1.0"
+description = "A Rust implementation of the SimString algorithm"
+license = "MIT"
+repository = "https://github.com/PyDataBlog/simstring_rs"
+# docs.rs URLs use the crate name exactly as published (`simstring_rs`).
+documentation = "https://docs.rs/simstring_rs"
 edition = "2021"
+authors = ["Bernard Brenyah <bbrenyah@gmail.com>"]
+# crates.io accepts at most five keywords per crate.
+keywords = ["string", "matching", "nlp", "simstring", "cpmerge"]
+# Categories must be slugs from the crates.io category list.
+categories = ["text-processing", "algorithms"]
+homepage = "https://github.com/PyDataBlog/simstring_rs#readme"
+readme = "README.md"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+[lib]
+name = "simstring_rs"
+path = "src/lib.rs"
 
 [dependencies]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Bernard Brenyah
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,59 @@
+# simstring_rs
+
+A native Rust implementation of the CPMerge algorithm, designed for approximate string matching. This crate is particularly useful for natural language processing tasks that require retrieving strings from very large text corpora. Currently, this crate supports both character-based and word-based n-gram feature generation, with plans to allow custom user-defined feature generation methods.
+
+## Features
+
+- ✅ Fast algorithm for string matching
+- ✅ 100% exact retrieval
+- ✅ Support for Unicode
+- [ ] Support for building databases directly from text files
+- [ ] Mecab-based tokenizer support
+- [ ] Support for persistent databases like MongoDB
+
+## Supported String Similarity Measures
+
+- ✅ Dice coefficient
+- ✅ Jaccard coefficient
+- ✅ Cosine coefficient
+- ✅ Overlap coefficient
+- ✅ Exact match
+
+## Installation
+
+Add `simstring_rs` to your `Cargo.toml`:
+
+```toml
+[dependencies]
+simstring_rs = "0.1.0"
+```
+
+For the latest features, you can depend on the `main` branch by specifying the Git repository:
+
+```toml
+[dependencies]
+simstring_rs = { git = "https://github.com/PyDataBlog/simstring_rs.git", branch = "main" }
+```
+
+Note: Using the `main` branch may include experimental features and potential breakages. Use with caution!
+
+To revert to a stable version, ensure your Cargo.toml specifies a specific version number instead of the Git repository.
+
+## Usage
+
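+Here is a basic example of how to use `simstring_rs` in your Rust project (a longer version lives in `examples/basic_usage.rs`):
+
+```rust
+use simstring_rs::database::{HashDB, SimStringDB};
+use simstring_rs::extractors::CharacterNGrams;
+
+fn main() {
+    let feature_extractor = CharacterNGrams {
+        n: 3,
+        padder: " ".to_string(),
+    };
+    let mut db = HashDB::new(feature_extractor);
+
+    db.insert("hello".to_string());
+    db.insert("world".to_string());
+
+    let (total_collection, avg_size_ngrams, total_ngrams) = db.describe_collection();
+    println!(
+        "Database contains {} strings, average n-gram size {:.2}, total n-grams {}.",
+        total_collection, avg_size_ngrams, total_ngrams
+    );
+}
+```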
+
+## Contributing
+
+Contributions are welcome! Please open an issue or submit a pull request on GitHub.
+## License
+
+This project is licensed under the MIT License.
+
+## Acknowledgements
+
+Inspired by the [SimString.jl](https://github.com/PyDataBlog/SimString.jl) project.
diff --git a/examples/basic_usage.rs b/examples/basic_usage.rs
@@ -0,0 +1,39 @@
+use simstring_rs::database::{HashDB, SimStringDB};
+use simstring_rs::extractors::{CharacterNGrams, FeatureExtractor};
+use simstring_rs::measures::Cosine;
+
+/// Builds a small in-memory database, prints its summary statistics and then
+/// prints the features extracted from a sample query string.
+fn main() {
+    // Unused in this example, hence the underscore prefix.
+    let _cs = Cosine::new();
+
+    // Extractor configured with n = 3 and a single-space padder.
+    let mut db = HashDB::new(CharacterNGrams {
+        n: 3,
+        padder: " ".to_string(),
+    });
+
+    for word in ["hello", "help", "halo", "world"] {
+        db.insert(word.to_string());
+    }
+
+    let (string_count, mean_ngrams, ngram_total) = db.describe_collection();
+    println!(
+        "Database contains {} strings, average n-gram size {:.2}, total n-grams {}.",
+        string_count, mean_ngrams, ngram_total
+    );
+
+    // TODO: `println!("Complete DB State: {:?}", db);` needs a Debug implementation for the db.
+
+    let query = "prepress";
+    let extracted = db.feature_extractor.extract(query);
+
+    println!("Query size: {}", extracted.len());
+
+    println!("Extracted features from query '{}':", query);
+    for (feature, count) in &extracted {
+        println!(" - Feature: '{}', Count: {}", feature, count);
+    }
+}
diff --git a/src/database/hashdb.rs b/src/database/hashdb.rs
@@ -0,0 +1,110 @@
+use crate::{FeatureExtractor, SimStringDB};
+use std::collections::{HashMap, HashSet};
+
+/// Hash-map backed string database, generic over the feature extractor used
+/// to turn strings into features.
+pub struct HashDB<TExtractor>
+where
+    TExtractor: FeatureExtractor,
+{
+    /// Extractor applied to every inserted string.
+    pub feature_extractor: TExtractor,
+    /// Every inserted string in insertion order; duplicates are not removed.
+    pub string_collection: Vec<String>,
+    /// Number of extracted features -> strings that produced that many.
+    pub string_size_map: HashMap<usize, HashSet<String>>,
+    /// Number of extracted features -> `(feature, count)` pair -> strings
+    /// whose extraction produced that pair.
+    pub string_feature_map: HashMap<usize, HashMap<(String, i32), HashSet<String>>>,
+    /// Results memoised by `lookup_feature_set_by_size_feature`, keyed by
+    /// `(size, (feature, count))`.
+    pub lookup_cache: HashMap<(usize, (String, i32)), HashSet<String>>,
+}
+
+impl<TExtractor> HashDB<TExtractor>
+where
+    TExtractor: FeatureExtractor,
+{
+    /// Creates an empty database that uses `feature_extractor` for all
+    /// subsequent insertions.
+    pub fn new(feature_extractor: TExtractor) -> Self {
+        HashDB {
+            feature_extractor,
+            string_collection: Vec::new(),
+            string_size_map: HashMap::new(),
+            string_feature_map: HashMap::new(),
+            lookup_cache: HashMap::new(),
+        }
+    }
+
+    /// Returns the strings recorded under `feature` in the `size` bucket.
+    ///
+    /// The result is memoised in `lookup_cache`. On a cache miss the set is
+    /// cloned out of `string_feature_map`; when no such entry exists an empty
+    /// set is cached and returned instead.
+    pub fn lookup_feature_set_by_size_feature(
+        &mut self,
+        size: usize,
+        feature: &(String, i32),
+    ) -> &HashSet<String> {
+        // The key is moved straight into `entry`; no second clone is needed.
+        self.lookup_cache
+            .entry((size, feature.clone()))
+            .or_insert_with(|| {
+                self.string_feature_map
+                    .get(&size)
+                    .and_then(|feature_map| feature_map.get(feature))
+                    .cloned()
+                    // Same result as `unwrap_or_else(HashSet::new)`, but does
+                    // not trip clippy's `unwrap_or_default` lint.
+                    .unwrap_or_default()
+            })
+    }
+}
+
+impl<TExtractor> SimStringDB for HashDB<TExtractor>
+where
+    TExtractor: FeatureExtractor,
+{
+    /// Returns the largest size key present in `string_feature_map`, or `0`
+    /// when the map is empty.
+    fn get_max_feature_size(&self) -> usize {
+        self.string_feature_map.keys().copied().max().unwrap_or(0)
+    }
+
+    /// Adds `s` to the collection and indexes it by feature count and by each
+    /// extracted `(feature, count)` pair.
+    ///
+    /// Any memoised lookup for a touched `(size, feature)` key is evicted so
+    /// that later lookups see the newly inserted string.
+    fn insert(&mut self, s: String) {
+        // Add the string to the collection
+        self.string_collection.push(s.clone());
+
+        // Extract features from the string
+        let features = self.feature_extractor.extract(&s);
+
+        // Determine the size (number of features)
+        let size = features.len();
+
+        // Update string_size_map
+        self.string_size_map
+            .entry(size)
+            .or_default()
+            .insert(s.clone());
+
+        // Update string_feature_map
+        let feature_map = self.string_feature_map.entry(size).or_default();
+
+        for (feature, count) in features {
+            let key = (feature.clone(), count);
+
+            // `lookup_cache` stores cloned snapshots of these sets; drop the
+            // snapshot for this key, otherwise it would keep returning the
+            // set as it was before this insertion.
+            self.lookup_cache.remove(&(size, key.clone()));
+
+            feature_map.entry(key).or_default().insert(s.clone());
+        }
+    }
+
+    /// Summarises the database as
+    /// `(total_collection, avg_size_ngrams, total_ngrams)`:
+    ///
+    /// * `total_collection` - length of `string_collection`.
+    /// * `avg_size_ngrams` - mean feature count over the strings held in
+    ///   `string_size_map`, or `0.0` when it holds none.
+    /// * `total_ngrams` - number of distinct `(feature, count)` keys, summed
+    ///   over every size bucket of `string_feature_map`.
+    fn describe_collection(&self) -> (usize, f64, usize) {
+        let total_collection = self.string_collection.len();
+
+        let total_sizes: usize = self
+            .string_size_map
+            .iter()
+            .map(|(size, strings)| size * strings.len())
+            .sum();
+        let total_strings: usize = self
+            .string_size_map
+            .values()
+            .map(|strings| strings.len())
+            .sum();
+        // Guard the division so an empty database reports 0.0 rather than NaN.
+        let avg_size_ngrams = if total_strings == 0 {
+            0.0
+        } else {
+            total_sizes as f64 / total_strings as f64
+        };
+
+        let total_ngrams: usize = self
+            .string_feature_map
+            .values()
+            .map(|feature_map| feature_map.len())
+            .sum();
+
+        (total_collection, avg_size_ngrams, total_ngrams)
+    }
+}
diff --git a/src/database/mod.rs b/src/database/mod.rs
@@ -0,0 +1,9 @@
+mod hashdb;
+
+/// Operations a SimString database backend provides; `HashDB` is the
+/// implementation re-exported from this module.
+pub trait SimStringDB {
+    /// Adds the string `s` to the database.
+    fn insert(&mut self, s: String);
+    /// Returns summary statistics as a `(usize, f64, usize)` tuple. `HashDB`
+    /// reports (stored string count, average feature count per string,
+    /// total number of indexed features).
+    fn describe_collection(&self) -> (usize, f64, usize);
+    /// Returns the maximum feature size in the database; `HashDB` returns
+    /// `0` when nothing has been inserted.
+    fn get_max_feature_size(&self) -> usize;
+}
+
+pub use hashdb::HashDB;