
Commit 6ae4835

docs: Updated readme (#9)
1 parent 10c852e commit 6ae4835

4 files changed: +58 -20 lines changed


Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 name = "model2vec-rs"
 version = "0.1.0"
 edition = "2021"
-description = "Official Rust Implementation for Model2Vec"
+description = "Official Rust Implementation of Model2Vec"
 readme = "README.md"
 license-file = "LICENSE"
 authors = ["Thomas van Dongen <thomas123@live.nl>", "Stéphan Tulkens <stephantul@gmail.com>"]

README.md

Lines changed: 46 additions & 8 deletions
@@ -1,17 +1,27 @@
-# model2vec-rs
 
-This crate provides a lightweight Rust implementation for loading and running inference on [Model2Vec](https://github.com/MinishLab/model2vec) static embedding models from either local folders or the Hugging Face Hub.
+<div align="center">
+<picture>
+<img width="35%" alt="Model2Vec logo" src="assets/images/model2vec_rs_logo.png">
+</picture>
+</div>
+
+<div align="center">
+
+[Quickstart](#quickstart)
+[Models](#models)
+[Performance](#performance)
+</div>
+
+This crate provides a lightweight Rust implementation for loading and inference of [Model2Vec](https://github.com/MinishLab/model2vec) static embedding models. For distillation and training, the [Python Model2Vec package](https://github.com/MinishLab/model2vec) can be used.
+
+
 
 ## Quick Start
 
-Install the crate:
+Add the crate:
 
 ```bash
-git clone https://github.com/minishlab/model2vec-rust.git
-cd model2vec-rs
-
-# Build
-cargo build --release
+cargo add model2vec-rs
 ```
 
 Make embeddings:
@@ -62,6 +72,34 @@ let embeddings = model.encode_with_args(
 );
 ```
 
+## Models
+
+We provide a number of models that can be used out of the box. These models are available on the [HuggingFace hub](https://huggingface.co/collections/minishlab/model2vec-base-models-66fd9dd9b7c3b3c0f25ca90e) and can be loaded using the `from_pretrained` method. The models are listed below.
+
+
+
+| Model | Language | Sentence Transformer | Params | Task |
+|-------|----------|----------------------|--------|------|
+| [potion-base-32M](https://huggingface.co/minishlab/potion-base-32M) | English | [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 32.3M | General |
+| [potion-base-8M](https://huggingface.co/minishlab/potion-base-8M) | English | [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 7.5M | General |
+| [potion-base-4M](https://huggingface.co/minishlab/potion-base-4M) | English | [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 3.7M | General |
+| [potion-base-2M](https://huggingface.co/minishlab/potion-base-2M) | English | [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1.8M | General |
+| [potion-retrieval-32M](https://huggingface.co/minishlab/potion-retrieval-32M) | English | [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 32.3M | Retrieval |
+| [M2V_multilingual_output](https://huggingface.co/minishlab/M2V_multilingual_output) | Multilingual | [LaBSE](https://huggingface.co/sentence-transformers/LaBSE) | 471M | General |
+
+
+## Performance
+
+We compared the performance of the Rust implementation with the Python version of Model2Vec. The benchmark was run single-threaded on a CPU.
+
+| Implementation | Throughput            |
+| -------------- | --------------------- |
+| **Rust**       | 8000 sentences/second |
+| **Python**     | 4650 sentences/second |
+
+The Rust version is roughly **1.7×** faster than the Python version.
+
+
 ## License
 
 MIT
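
For orientation, here is a minimal sketch of the load-and-encode flow described in the Quick Start and Models sections above. It only uses the calls exercised in tests/test_model.rs in this commit (`StaticModel::from_pretrained` and `encode`); the Hub model id and the sentences are illustrative, and the three `None` arguments simply mirror the tests rather than a documented signature.

```rust
use model2vec_rs::model::StaticModel;

fn main() {
    // Load a Model2Vec model, e.g. one of the models from the table above, or a
    // local folder path such as "tests/fixtures/test-model-float32". The trailing
    // options are left as None, as in the tests; the third one is the `normalize`
    // override used in test_normalization_flag_override.
    let model = StaticModel::from_pretrained("minishlab/potion-base-8M", None, None, None)
        .expect("failed to load model");

    // Encode a batch of sentences; the result is one Vec<f32> embedding per sentence.
    let sentences = vec![
        "hello world".to_string(),
        "static embeddings are fast".to_string(),
    ];
    let embeddings: Vec<Vec<f32>> = model.encode(&sentences);
    println!("{} embeddings of dimension {}", embeddings.len(), embeddings[0].len());
}
```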
assets/images/model2vec_rs_logo.png

1.69 MB (binary image added; not rendered in the diff)

tests/test_model.rs

Lines changed: 11 additions & 11 deletions
@@ -6,15 +6,16 @@ use model2vec_rs::model::StaticModel;
 
 #[test]
 fn test_encode_matches_python_model2vec() {
-    // Load your test model once
+    // Load the test model
     let model = load_test_model();
 
-    // Define (fixture path, inputs) for both short and long cases
-    let long_text = vec!["hello"; 1000].join(" "); // 1 000 “hello”s
+    // Define the short and long text inputs
+    let long_text = vec!["hello"; 1000].join(" ");
+    let short_text = "hello world".to_string();
     let cases = vec![
         (
             "tests/fixtures/embeddings_short.json",
-            vec!["hello world".to_string()],
+            vec![short_text],
         ),
         (
             "tests/fixtures/embeddings_long.json",
@@ -29,7 +30,7 @@ fn test_encode_matches_python_model2vec() {
         let expected: Vec<Vec<f32>> = serde_json::from_str(&fixture)
             .expect("Failed to parse fixture");
 
-        // Encode with your Rust model
+        // Encode with the Rust model
         let output = model.encode(&inputs);
 
         // Sanity checks
@@ -53,16 +54,15 @@ fn test_encode_matches_python_model2vec() {
     }
 }
 
-
-/// Test that encoding an empty input slice yields an empty Vec
+/// Test that encoding an empty input slice yields an empty output
 #[test]
 fn test_encode_empty_input() {
     let model = load_test_model();
     let embs: Vec<Vec<f32>> = model.encode(&[]);
     assert!(embs.is_empty(), "Expected no embeddings for empty input");
 }
 
-/// Test encoding a single empty sentence produces a zero vector with no NaNs
+/// Test that encoding a single empty sentence produces a zero vector
 #[test]
 fn test_encode_empty_sentence() {
     let model = load_test_model();
@@ -75,21 +75,21 @@ fn test_encode_empty_sentence() {
 /// Test override of `normalize` flag in from_pretrained
 #[test]
 fn test_normalization_flag_override() {
-    // first load with normalize = true (default in config)
+    // Load with normalize = true (default in config)
     let model_norm = StaticModel::from_pretrained(
         "tests/fixtures/test-model-float32", None, None, None
     ).unwrap();
     let emb_norm = model_norm.encode(&["test sentence".to_string()])[0].clone();
     let norm_norm = emb_norm.iter().map(|&x| x*x).sum::<f32>().sqrt();
 
-    // now load with normalize = false override
+    // Load with normalize = false override
     let model_no_norm = StaticModel::from_pretrained(
         "tests/fixtures/test-model-float32", None, Some(false), None
     ).unwrap();
     let emb_no = model_no_norm.encode(&["test sentence".to_string()])[0].clone();
     let norm_no = emb_no.iter().map(|&x| x*x).sum::<f32>().sqrt();
 
-    // normalized version should have unit length, override should give larger norm
+    // Normalized version should have unit length, override should give larger norm
     assert!((norm_norm - 1.0).abs() < 1e-5, "Normalized vector should have unit norm");
     assert!(norm_no > norm_norm, "Without normalization override, norm should be larger");
 }
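
The Performance table in the README above reports single-threaded CPU throughput without spelling out the harness. Below is a rough sketch of how such a sentences-per-second figure could be measured with the crate's `encode` API; the model id, sentence count, and sentence contents are illustrative assumptions, not the benchmark actually used.

```rust
use std::time::Instant;

use model2vec_rs::model::StaticModel;

fn main() {
    // Load the model to measure; the Hub id is an illustrative choice.
    let model = StaticModel::from_pretrained("minishlab/potion-base-8M", None, None, None)
        .expect("failed to load model");

    // Build a synthetic workload of short sentences.
    let sentences: Vec<String> = (0..10_000)
        .map(|i| format!("this is benchmark sentence number {i}"))
        .collect();

    // Time one encode pass over the whole batch and report sentences per second.
    let start = Instant::now();
    let embeddings = model.encode(&sentences);
    let secs = start.elapsed().as_secs_f64();
    println!(
        "encoded {} sentences in {:.2} s ({:.0} sentences/s)",
        embeddings.len(),
        secs,
        embeddings.len() as f64 / secs
    );
}
```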
