Merge pull request #212 from unum-cloud/main-dev

Clustering & Performance Improvement
unum-cloud · Aug 24, 2023 · 55e4b05 · 55e4b05
2 parents b7ec32c + 58b043c
commit 55e4b05
Show file tree

Hide file tree

Showing 49 changed files with 3,317 additions and 1,290 deletions.
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
@@ -66,7 +66,7 @@ jobs:
       - name: Build locally
         run: python -m pip install .
       - name: Test with PyTest
-        run: pytest python/scripts/test.py
+        run: pytest python/scripts/
 
 
   test_python_37:
@@ -95,7 +95,7 @@ jobs:
         run: python -m pip install .
 
       - name: Test with PyTest
-        run: pytest python/scripts/test.py
+        run: pytest python/scripts/
 
 
   test_javascript:
@@ -160,7 +160,7 @@ jobs:
         sudo apt install -y nodejs
         git clone https://github.com/emscripten-core/emsdk.git
         ./emsdk/emsdk install latest
-    - name: Build USearch by Emscripten
+    - name: Build USearch using Emscripten
       run: |
         ./emsdk/emsdk activate latest && source ./emsdk/emsdk_env.sh
         emcmake cmake -DUSEARCH_BUILD_BENCHMARK=0 -DUSEARCH_BUILD_WASM=1 -B ./build -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -s TOTAL_MEMORY=64MB" && emmake make -C ./build

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -111,13 +111,13 @@ jobs:
       - uses: actions/setup-python@v3
 
       - name: Setup Docker
-        if: matrix.os != 'windows-2022'
+        if: matrix.os == 'ubuntu-20.04'
         uses: crazy-max/ghaction-setup-docker@v1.0.0
         with:
           version: 23.0.1
 
       - name: Setup QEMU
-        if: matrix.os != 'windows-2022'
+        if: matrix.os == 'ubuntu-20.04'
         uses: docker/setup-qemu-action@v2.1.0
 
       - name: Install CIBuildWheel
@@ -319,7 +319,7 @@ jobs:
         run: |
               sudo apt update && 
               sudo apt install -y doxygen graphviz dia git && 
-              pip install sphinx sphinx-js breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery && 
+              pip install sphinx==7.1.2 sphinx-js breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery && 
               npm install -g jsdoc
       - name: Install USearch from PyPi
         run: pip install usearch

diff --git a/.github/workflows/update_version.sh b/.github/workflows/update_version.sh
@@ -9,5 +9,5 @@ echo $1 > VERSION &&
     sed -i "s/^\(#define USEARCH_VERSION_MINOR \).*/\1$(echo "$1" | cut -d. -f2)/" ./include/usearch/index.hpp &&
     sed -i "s/^\(#define USEARCH_VERSION_PATCH \).*/\1$(echo "$1" | cut -d. -f3)/" ./include/usearch/index.hpp &&
     sed -i "s/<version>[0-9]\+\.[0-9]\+\.[0-9]\+/<version>$1/" README.md &&
-    sed -i "s/version = {0\.[0-9]\+\.[0-9]\+}/version = {$1}/" README.md &&
+    sed -i "s/version = {[0-9]\+\.[0-9]\+\.[0-9]\+}/version = {$1}/" README.md &&
     sed -i "s/version=\".*\"/version=\"$1\"/" wasmer.toml
diff --git a/.gitmodules b/.gitmodules
@@ -3,7 +3,4 @@
 	url = https://github.com/ashvardanian/simsimd
 [submodule "fp16"]
 	path = fp16
-	url = https://github.com/maratyszcza/fp16
-[submodule "robin-map"]
-	path = robin-map
-	url = https://github.com/tessil/robin-map
+	url = https://github.com/maratyszcza/fp16
diff --git a/Package.swift b/Package.swift
@@ -19,20 +19,21 @@ let package = Package(
             cxxSettings: [
                 .headerSearchPath("../include/"),
                 .headerSearchPath("../fp16/include/"),
-                .headerSearchPath("../robin-map/include/"),
                 .headerSearchPath("../simismd/include/")
             ]
         ),
         .target(
             name: "USearch",
             dependencies: ["USearchObjective"],
             path: "swift",
+            exclude: ["README.md", "Test.swift"],
             sources: ["USearch.swift", "Index+Sugar.swift"]
         ),
         .testTarget(
             name: "USearchTests",
             dependencies: ["USearch"],
             path: "swift",
+            exclude: ["USearch.swift", "Index+Sugar.swift", "README.md"],
             sources: ["Test.swift"]
         )
     ],

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <h1 align="center">USearch</h1>
 <h3 align="center">
-Smaller & Faster Single-File<br/>
-Vector Search Engine<br/>
+Faster & Smaller Single-File<br/>
+Search Engine for Vectors & Texts<br/>
 </h3>
 <br/>
 
@@ -18,7 +18,7 @@ Vector Search Engine<br/>
 </p>
 
 <p align="center">
-Euclidean • Angular • Jaccard • Hamming • Haversine • User-Defined Metrics
+Euclidean • Angular • Bitwise • Haversine • User-Defined Metrics
 <br/>
 <a href="https://unum-cloud.github.io/usearch/cpp">C++ 11</a> •
 <a href="https://unum-cloud.github.io/usearch/python">Python 3</a> •
@@ -31,9 +31,16 @@ Euclidean • Angular • Jaccard • Hamming • Haversine • User-Defined Met
 <a href="https://unum-cloud.github.io/usearch/golang">GoLang</a> •
 <a href="https://unum-cloud.github.io/usearch/wolfram">Wolfram</a>
 <br/>
-Linux • MacOS • Windows • Docker • WebAssembly
+Linux • MacOS • Windows • iOS • Docker • WebAssembly
 </p>
 
+<div align="center">
+<a href="https://pypi.org/project/usearch/"> <img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/usearch?label=pypi%20downloads"> </a>
+<a href="https://www.npmjs.com/package/usearch"> <img alt="npm" src="https://img.shields.io/npm/dy/usearch?label=npm%20downloads"> </a>
+<a href="https://crates.io/crates/usearch"> <img alt="Crates.io" src="https://img.shields.io/crates/d/usearch?label=crate%20downloads"> </a>
+<img alt="GitHub code size in bytes" src="https://img.shields.io/github/languages/code-size/unum-cloud/usearch">
+</div>
+
 ---
 
 - ✅ Benchmark-topping performance.
@@ -42,10 +49,11 @@ Linux • MacOS • Windows • Docker • WebAssembly
 - ✅ Variable dimensionality vectors for unique applications, including search over compressed data.
 - ✅ Bitwise Tanimoto and Sorensen coefficients for [Genomics and Chemistry applications](#usearch--rdkit--molecular-search).
 - ✅ Hardware-agnostic `f16` & `i8` - [half-precision & quarter-precision support](#memory-efficiency-downcasting-and-quantization).
-- ✅ [View large indexes from disk](#disk-based-indexes) without loading into RAM.
+- ✅ [View large indexes from disk](#serving-index-from-disk) without loading into RAM.
 - ✅ Space-efficient point-clouds with `uint40_t`, accommodating 4B+ size.
 - ✅ Compatible with OpenMP and custom "executors", for fine-grained control over CPU utilization.
 - ✅ Heterogeneous lookups, renaming/relabeling, and on-the-fly deletions.
+- ✅ Near-real-time [clustering and sub-clusterings](#clustering) for tens or millions of clusters.
 - ✅ [Semantic Search](#usearch--ai--multi-modal-semantic-search) and [Joins](#joins).
 
 [usearch-header]: https://github.com/unum-cloud/usearch/blob/main/include/usearch/index.hpp
@@ -57,14 +65,15 @@ FAISS is a widely recognized standard for high-performance vector search engines
 USearch and FAISS both employ the same HNSW algorithm, but they differ significantly in their design principles.
 USearch is compact and broadly compatible without sacrificing performance, with a primary focus on user-defined metrics and fewer dependencies.
 
-|                    | FAISS                         | USearch                            |
-| :----------------- | :---------------------------- | :--------------------------------- |
-| Implementation     | 84 K [SLOC][sloc] in `faiss/` | 3 K [SLOC][sloc] in `usearch/`     |
-| Supported metrics  | 9 fixed metrics               | Any User-Defined metrics           |
-| Supported ID types | `uint32_t`, `uint64_t`        | `uint32_t`, `uint40_t`, `uint64_t` |
-| Dependencies       | BLAS, OpenMP                  | None                               |
-| Bindings           | SWIG                          | Native                             |
-| Acceleration       | Learned Quantization          | Downcasting                        |
+|                     | FAISS                         | USearch                            |
+| :------------------ | :---------------------------- | :--------------------------------- |
+| Implementation      | 84 K [SLOC][sloc] in `faiss/` | 3 K [SLOC][sloc] in `usearch/`     |
+| Supported metrics   | 9 fixed metrics               | Any User-Defined metrics           |
+| Supported languages | C++, Python                   | 10 languages                       |
+| Supported ID types  | `uint32_t`, `uint64_t`        | `uint32_t`, `uint40_t`, `uint64_t` |
+| Dependencies        | BLAS, OpenMP                  | None                               |
+| Bindings            | SWIG                          | Native                             |
+| Acceleration        | Learned Quantization          | Downcasting                        |
 
 [sloc]: https://en.wikipedia.org/wiki/Source_lines_of_code
 
@@ -96,6 +105,25 @@ assert matches[0].distance <= 0.001
 assert np.allclose(index[42], vector)
 ```
 
+Comparing the performance of FAISS against USearch on 1 Million 96-dimensional vectors from the famous Deep1B dataset, one can expect the following numbers on modern AWS `c7g.metal` instances.
+
+|              | FAISS, `f32` | USearch, `f32` | USearch, `f16` |     USearch, `i8` |
+| :----------- | -----------: | -------------: | -------------: | ----------------: |
+| Batch Insert |       16 K/s |         73 K/s |        100 K/s | 104 K/s **+550%** |
+| Batch Search |       82 K/s |        103 K/s |        113 K/s |  134 K/s **+63%** |
+| Bulk Insert  |       76 K/s |        105 K/s |        115 K/s | 202 K/s **+165%** |
+| Bulk Search  |      118 K/s |        174 K/s |        173 K/s | 304 K/s **+157%** |
+| Recall @ 10  |          99% |          99.2% |          99.1% |             99.2% |
+
+> HNSW was configured with identical hyper-parameters:
+> connectivity `M=16`,
+> expansion @ construction `efConstruction=128`,
+> and expansion @ search `ef=64`.
+> Batch size is 256.
+> Jump to the [Performance Tuning][benchmarking] section to read about the effects of those hyper-parameters.
+
+[benchmarking]: https://github.com/unum-cloud/usearch/blob/main/docs/benchmarks.md
+
 ## User-Defined Functions
 
 While most vector search packages concentrate on just a couple of metrics - "Inner Product distance" and "Euclidean distance," USearch extends this list to include any user-defined metrics.
@@ -124,27 +152,7 @@ Instead, we have focused on high-precision arithmetic over low-precision downcas
 The same index, and `add` and `search` operations will automatically down-cast or up-cast between `f32_t`, `f16_t`, `f64_t`, and `i8_t` representations, even if the hardware doesn't natively support it.
 Continuing the topic of memory efficiency, we provide a `uint40_t` to allow collection with over 4B+ vectors without allocating 8 bytes for every neighbor reference in the proximity graph.
 
-|              | FAISS, `f32` | USearch, `f32` | USearch, `f16` |     USearch, `i8` |
-| :----------- | -----------: | -------------: | -------------: | ----------------: |
-| Batch Insert |       16 K/s |         73 K/s |        100 K/s | 104 K/s **+550%** |
-| Batch Search |       82 K/s |        103 K/s |        113 K/s |  134 K/s **+63%** |
-| Bulk Insert  |       76 K/s |        105 K/s |        115 K/s | 202 K/s **+165%** |
-| Bulk Search  |      118 K/s |        174 K/s |        173 K/s | 304 K/s **+157%** |
-| Recall @ 10  |          99% |          99.2% |          99.1% |             99.2% |
-
-> Dataset: 1M vectors sample of the Deep1B dataset.
-> Hardware: `c7g.metal` AWS instance with 64 cores and DDR5 memory.
-> HNSW was configured with identical hyper-parameters:
-> connectivity `M=16`,
-> expansion @ construction `efConstruction=128`,
-> and expansion @ search `ef=64`.
-> Batch size is 256.
-> Both libraries were compiled for the target architecture.
-> Jump to the [Performance Tuning][benchmarking] section to read about the effects of those hyper-parameters.
-
-[benchmarking]: https://github.com/unum-cloud/usearch/blob/main/docs/benchmarks.md
-
-## Disk-based Indexes
+## Serving `Index` from Disk
 
 With USearch, you can serve indexes from external memory, enabling you to optimize your server choices for indexing speed and serving costs.
 This can result in **20x cost reduction** on AWS and other public clouds.
@@ -159,7 +167,7 @@ other_view = Index(ndim=..., metric=CompiledMetric(...))
 other_view.view("index.usearch")
 ```
 
-## Exact, Approximate, and Multi-Index Lookups
+## Exact vs. Approximate Search
 
 Approximate search methods, such as HNSW, are predominantly used when an exact brute-force search becomes too resource-intensive.
 This typically occurs when you have millions of entries in a collection.
@@ -183,6 +191,8 @@ When compared to FAISS's `IndexFlatL2` in Google Colab, **[USearch may offer up
 - `faiss.IndexFlatL2`: **55.3 ms**.
 - `usearch.index.search`: **2.54 ms**.
 
+## `Indexes` for Multi-Index Lookups
+
 For larger workloads targeting billions or even trillions of vectors, parallel multi-index lookups become invaluable.
 These lookups prevent the need to construct a single, massive index, allowing users to query multiple smaller ones instead.
 
@@ -198,6 +208,36 @@ multi_index = Indexes(
 multi_index.search(...)
 ```
 
+## Clustering
+
+Once the index is constructed, it can be used to cluster entries much faster.
+In essence, the `Index` itself can be seen as a clustering, and it allows iterative deepening.
+
+```py
+clustering = index.cluster(
+    min_count=10, # Optional
+    max_count=15, # Optional
+    threads=..., # Optional
+)
+
+# Get the clusters and their sizes
+centroid_keys, sizes = clustering.centroids_popularity
+
+# Use Matplotlib to draw a histogram
+clustering.plot_centroids_popularity()
+
+# Export a NetworkX graph of the clusters
+g = clustering.network
+
+# Get members of a specific cluster
+first_members = clustering.members_of(centroid_keys[0])
+
+# Deepen into that cluster, splitting it into more parts; all the same arguments are supported
+sub_clustering = clustering.subcluster(min_count=..., max_count=...)
+```
+
+Using Scikit-Learn, on a 1 Million point dataset, one may expect queries to take anywhere from minutes to hours, depending on the number of clusters you want to highlight. For 50'000 clusters the performance difference between USearch and conventional clustering methods may easily reach 100x.
+
 ## Joins, One-to-One, One-to-Many, and Many-to-Many Mappings
 
 One of the big questions these days is how will AI change the world of databases and data management.
@@ -229,7 +269,7 @@ Broader functionality is ported per request.
 | Add, search             |   ✅    |    ✅     |   ✅   |   ✅   |     ✅      |   ✅   |   ✅    |   ✅   |
 | Save, load, view        |   ✅    |    ✅     |   ✅   |   ✅   |     ✅      |   ✅   |   ✅    |   ✅   |
 | User-defined metrics    |   ✅    |    ✅     |   ✅   |   ❌   |     ❌      |   ❌   |   ❌    |   ❌   |
-| Joins                    |   ✅    |    ✅     |   ❌   |   ❌   |     ❌      |   ❌   |   ❌    |   ❌   |
+| Joins                   |   ✅    |    ✅     |   ❌   |   ❌   |     ❌      |   ❌   |   ❌    |   ❌   |
 | Variable-length vectors |   ✅    |    ❌     |   ❌   |   ❌   |     ❌      |   ❌   |   ❌    |   ❌   |
 | 4B+ capacities          |   ✅    |    ❌     |   ❌   |   ❌   |     ❌      |   ❌   |   ❌    |   ❌   |
 
@@ -316,13 +356,17 @@ matches = index.search(fingerprints, 10)
 [smiles]: https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system
 [rdkit-fingerprints]: https://www.rdkit.org/docs/RDKit_Book.html#additional-information-about-the-fingerprints
 
+### USearch + POI Coordinates = GIS Applications... on iOS?
+
+With Objective-C and iOS bindings, USearch can be easily used in mobile applications.
+
 
 ## Integrations
 
-- [x] GPT-Cache.
-- [x] LangChain.
-- [ ] ClickHouse.
-- [ ] Microsoft Semantic Kernel.
+- [x] GPTCache: [Python](https://github.com/zilliztech/GPTCache/releases/tag/0.1.29).
+- [x] LangChain: [Python](https://github.com/langchain-ai/langchain/releases/tag/v0.0.257) and [JavaScript](https://github.com/hwchase17/langchainjs/releases/tag/0.0.125).
+- [x] ClickHouse: [C++](https://github.com/ClickHouse/ClickHouse/pull/53447).
+- [ ] Microsoft Semantic Kernel: [Python](https://github.com/microsoft/semantic-kernel/pull/2358) and C#.
 
 ## Citations
 
@@ -332,8 +376,8 @@ doi = {10.5281/zenodo.7949416},
 author = {Vardanian, Ash},
 title = {{USearch by Unum Cloud}},
 url = {https://github.com/unum-cloud/usearch},
-version = {0.13.0},
-year = {2022}
+version = {1.0.0},
+year = {2022},
 month = jun,
 }
 ```
diff --git a/binding.gyp b/binding.gyp
@@ -2,32 +2,36 @@
     "targets": [
         {
             "target_name": "usearch",
-            "sources": [
-                "javascript/lib.cpp"
-            ],
+            "sources": ["javascript/lib.cpp"],
             "include_dirs": [
                 "<!@(node -p \"require('node-addon-api').include\")",
                 "include",
                 "fp16/include",
-                "robin-map/include",
-                "simsimd/include"
+                "simsimd/include",
+            ],
+            "dependencies": ["<!(node -p \"require('node-addon-api').gyp\")"],
+            "cflags": [
+                "-fexceptions",
+                "-Wno-unknown-pragmas",
+                "-Wno-maybe-uninitialized",
             ],
-            "dependencies": [
-                "<!(node -p \"require('node-addon-api').gyp\")"
+            "cflags_cc": [
+                "-fexceptions",
+                "-Wno-unknown-pragmas",
+                "-Wno-maybe-uninitialized",
+                "-std=c++11",
             ],
-            "cflags": ["-fexceptions", "-Wno-unknown-pragmas", "-Wno-maybe-uninitialized"],
-            "cflags_cc": ["-fexceptions", "-Wno-unknown-pragmas", "-Wno-maybe-uninitialized", "-std=c++11"],
             "xcode_settings": {
                 "GCC_ENABLE_CPP_EXCEPTIONS": "YES",
                 "CLANG_CXX_LIBRARY": "libc++",
-                "MACOSX_DEPLOYMENT_TARGET": "10.15"
+                "MACOSX_DEPLOYMENT_TARGET": "10.15",
             },
             "msvs_settings": {
                 "VCCLCompilerTool": {
                     "ExceptionHandling": 1,
-                    "AdditionalOptions": ["-std:c++11"]
+                    "AdditionalOptions": ["-std:c++11"],
                 }
-            }
+            },
         }
     ]
 }
diff --git a/build.gradle b/build.gradle
@@ -45,7 +45,7 @@ model {
                         include "**/*.cpp"
                     }
                     exportedHeaders {
-                        srcDirs "include", "fp16/include", "robin-map/include", "simsimd/include", "${Jvm.current().javaHome}/include"
+                        srcDirs "include", "fp16/include", "simsimd/include", "${Jvm.current().javaHome}/include"
                     }
                 }
             }

diff --git a/build.rs b/build.rs
@@ -6,7 +6,6 @@ fn main() {
         .include("include")
         .include("rust")
         .include("fp16/include")
-        .include("robin-map/include")
         .include("simsimd/include")
         .compile("usearch");