Skip to content

Commit

Permalink
Merge pull request #212 from unum-cloud/main-dev
Browse files Browse the repository at this point in the history
Clustering & Performance Improvement
  • Loading branch information
ashvardanian authored Aug 24, 2023
2 parents b7ec32c + 58b043c commit 55e4b05
Show file tree
Hide file tree
Showing 49 changed files with 3,317 additions and 1,290 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/prerelease.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
- name: Build locally
run: python -m pip install .
- name: Test with PyTest
run: pytest python/scripts/test.py
run: pytest python/scripts/


test_python_37:
Expand Down Expand Up @@ -95,7 +95,7 @@ jobs:
run: python -m pip install .

- name: Test with PyTest
run: pytest python/scripts/test.py
run: pytest python/scripts/


test_javascript:
Expand Down Expand Up @@ -160,7 +160,7 @@ jobs:
sudo apt install -y nodejs
git clone https://github.com/emscripten-core/emsdk.git
./emsdk/emsdk install latest
- name: Build USearch by Emscripten
- name: Build USearch using Emscripten
run: |
./emsdk/emsdk activate latest && source ./emsdk/emsdk_env.sh
emcmake cmake -DUSEARCH_BUILD_BENCHMARK=0 -DUSEARCH_BUILD_WASM=1 -B ./build -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -s TOTAL_MEMORY=64MB" && emmake make -C ./build
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,13 @@ jobs:
- uses: actions/setup-python@v3

- name: Setup Docker
if: matrix.os != 'windows-2022'
if: matrix.os == 'ubuntu-20.04'
uses: crazy-max/ghaction-setup-docker@v1.0.0
with:
version: 23.0.1

- name: Setup QEMU
if: matrix.os != 'windows-2022'
if: matrix.os == 'ubuntu-20.04'
uses: docker/setup-qemu-action@v2.1.0

- name: Install CIBuildWheel
Expand Down Expand Up @@ -319,7 +319,7 @@ jobs:
run: |
sudo apt update &&
sudo apt install -y doxygen graphviz dia git &&
pip install sphinx sphinx-js breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery &&
pip install sphinx==7.1.2 sphinx-js breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery &&
npm install -g jsdoc
- name: Install USearch from PyPi
run: pip install usearch
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/update_version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ echo $1 > VERSION &&
sed -i "s/^\(#define USEARCH_VERSION_MINOR \).*/\1$(echo "$1" | cut -d. -f2)/" ./include/usearch/index.hpp &&
sed -i "s/^\(#define USEARCH_VERSION_PATCH \).*/\1$(echo "$1" | cut -d. -f3)/" ./include/usearch/index.hpp &&
sed -i "s/<version>[0-9]\+\.[0-9]\+\.[0-9]\+/<version>$1/" README.md &&
sed -i "s/version = {0\.[0-9]\+\.[0-9]\+}/version = {$1}/" README.md &&
sed -i "s/version = {[0-9]\+\.[0-9]\+\.[0-9]\+}/version = {$1}/" README.md &&
sed -i "s/version=\".*\"/version=\"$1\"/" wasmer.toml
5 changes: 1 addition & 4 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,4 @@
url = https://github.com/ashvardanian/simsimd
[submodule "fp16"]
path = fp16
url = https://github.com/maratyszcza/fp16
[submodule "robin-map"]
path = robin-map
url = https://github.com/tessil/robin-map
url = https://github.com/maratyszcza/fp16
3 changes: 2 additions & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,21 @@ let package = Package(
cxxSettings: [
.headerSearchPath("../include/"),
.headerSearchPath("../fp16/include/"),
.headerSearchPath("../robin-map/include/"),
.headerSearchPath("../simismd/include/")
]
),
.target(
name: "USearch",
dependencies: ["USearchObjective"],
path: "swift",
exclude: ["README.md", "Test.swift"],
sources: ["USearch.swift", "Index+Sugar.swift"]
),
.testTarget(
name: "USearchTests",
dependencies: ["USearch"],
path: "swift",
exclude: ["USearch.swift", "Index+Sugar.swift", "README.md"],
sources: ["Test.swift"]
)
],
Expand Down
128 changes: 86 additions & 42 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<h1 align="center">USearch</h1>
<h3 align="center">
Smaller & Faster Single-File<br/>
Vector Search Engine<br/>
Faster & Smaller Single-File<br/>
Search Engine for Vectors & Texts<br/>
</h3>
<br/>

Expand All @@ -18,7 +18,7 @@ Vector Search Engine<br/>
</p>

<p align="center">
Euclidean • Angular • Jaccard • Hamming • Haversine • User-Defined Metrics
Euclidean • Angular • Bitwise • Haversine • User-Defined Metrics
<br/>
<a href="https://unum-cloud.github.io/usearch/cpp">C++ 11</a> •
<a href="https://unum-cloud.github.io/usearch/python">Python 3</a> •
Expand All @@ -31,9 +31,16 @@ Euclidean • Angular • Jaccard • Hamming • Haversine • User-Defined Met
<a href="https://unum-cloud.github.io/usearch/golang">GoLang</a> •
<a href="https://unum-cloud.github.io/usearch/wolfram">Wolfram</a>
<br/>
Linux • MacOS • Windows • Docker • WebAssembly
Linux • MacOS • Windows • iOS • Docker • WebAssembly
</p>

<div align="center">
<a href="https://pypi.org/project/usearch/"> <img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/usearch?label=pypi%20downloads"> </a>
<a href="https://www.npmjs.com/package/usearch"> <img alt="npm" src="https://img.shields.io/npm/dy/usearch?label=npm%20downloads"> </a>
<a href="https://crates.io/crates/usearch"> <img alt="Crates.io" src="https://img.shields.io/crates/d/usearch?label=crate%20downloads"> </a>
<img alt="GitHub code size in bytes" src="https://img.shields.io/github/languages/code-size/unum-cloud/usearch">
</div>

---

- ✅ Benchmark-topping performance.
Expand All @@ -42,10 +49,11 @@ Linux • MacOS • Windows • Docker • WebAssembly
- ✅ Variable dimensionality vectors for unique applications, including search over compressed data.
- ✅ Bitwise Tanimoto and Sorensen coefficients for [Genomics and Chemistry applications](#usearch--rdkit--molecular-search).
- ✅ Hardware-agnostic `f16` & `i8` - [half-precision & quarter-precision support](#memory-efficiency-downcasting-and-quantization).
-[View large indexes from disk](#disk-based-indexes) without loading into RAM.
-[View large indexes from disk](#serving-index-from-disk) without loading into RAM.
- ✅ Space-efficient point-clouds with `uint40_t`, accommodating 4B+ size.
- ✅ Compatible with OpenMP and custom "executors", for fine-grained control over CPU utilization.
- ✅ Heterogeneous lookups, renaming/relabeling, and on-the-fly deletions.
- ✅ Near-real-time [clustering and sub-clusterings](#clustering) for Tens or Millions of clusters.
-[Semantic Search](#usearch--ai--multi-modal-semantic-search) and [Joins](#joins).

[usearch-header]: https://github.com/unum-cloud/usearch/blob/main/include/usearch/index.hpp
Expand All @@ -57,14 +65,15 @@ FAISS is a widely recognized standard for high-performance vector search engines
USearch and FAISS both employ the same HNSW algorithm, but they differ significantly in their design principles.
USearch is compact and broadly compatible without sacrificing performance, with a primary focus on user-defined metrics and fewer dependencies.

| | FAISS | USearch |
| :----------------- | :---------------------------- | :--------------------------------- |
| Implementation | 84 K [SLOC][sloc] in `faiss/` | 3 K [SLOC][sloc] in `usearch/` |
| Supported metrics | 9 fixed metrics | Any User-Defined metrics |
| Supported ID types | `uint32_t`, `uint64_t` | `uint32_t`, `uint40_t`, `uint64_t` |
| Dependencies | BLAS, OpenMP | None |
| Bindings | SWIG | Native |
| Acceleration | Learned Quantization | Downcasting |
| | FAISS | USearch |
| :------------------ | :---------------------------- | :--------------------------------- |
| Implementation | 84 K [SLOC][sloc] in `faiss/` | 3 K [SLOC][sloc] in `usearch/` |
| Supported metrics | 9 fixed metrics | Any User-Defined metrics |
| Supported languages | C++, Python | 10 languages |
| Supported ID types | `uint32_t`, `uint64_t` | `uint32_t`, `uint40_t`, `uint64_t` |
| Dependencies | BLAS, OpenMP | None |
| Bindings | SWIG | Native |
| Acceleration | Learned Quantization | Downcasting |

[sloc]: https://en.wikipedia.org/wiki/Source_lines_of_code

Expand Down Expand Up @@ -96,6 +105,25 @@ assert matches[0].distance <= 0.001
assert np.allclose(index[42], vector)
```

Comparing the performance of FAISS against USearch on 1 Million 96-dimensional vectors from the famous Deep1B dataset, one can expect the following numbers on modern AWS `c7g.metal` instances.

| | FAISS, `f32` | USearch, `f32` | USearch, `f16` | USearch, `i8` |
| :----------- | -----------: | -------------: | -------------: | ----------------: |
| Batch Insert | 16 K/s | 73 K/s | 100 K/s | 104 K/s **+550%** |
| Batch Search | 82 K/s | 103 K/s | 113 K/s | 134 K/s **+63%** |
| Bulk Insert | 76 K/s | 105 K/s | 115 K/s | 202 K/s **+165%** |
| Bulk Search | 118 K/s | 174 K/s | 173 K/s | 304 K/s **+157%** |
| Recall @ 10 | 99% | 99.2% | 99.1% | 99.2% |

> HNSW was configured with identical hyper-parameters:
> connectivity `M=16`,
> expansion @ construction `efConstruction=128`,
> and expansion @ search `ef=64`.
> Batch size is 256.
> Jump to the [Performance Tuning][benchmarking] section to read about the effects of those hyper-parameters.
[benchmarking]: https://github.com/unum-cloud/usearch/blob/main/docs/benchmarks.md

## User-Defined Functions

While most vector search packages concentrate on just a couple of metrics - "Inner Product distance" and "Euclidean distance," USearch extends this list to include any user-defined metrics.
Expand Down Expand Up @@ -124,27 +152,7 @@ Instead, we have focused on high-precision arithmetic over low-precision downcas
The same index, and `add` and `search` operations will automatically down-cast or up-cast between `f32_t`, `f16_t`, `f64_t`, and `i8_t` representations, even if the hardware doesn't natively support it.
Continuing the topic of memory efficiency, we provide a `uint40_t` to allow collections with 4B+ vectors without allocating 8 bytes for every neighbor reference in the proximity graph.

| | FAISS, `f32` | USearch, `f32` | USearch, `f16` | USearch, `i8` |
| :----------- | -----------: | -------------: | -------------: | ----------------: |
| Batch Insert | 16 K/s | 73 K/s | 100 K/s | 104 K/s **+550%** |
| Batch Search | 82 K/s | 103 K/s | 113 K/s | 134 K/s **+63%** |
| Bulk Insert | 76 K/s | 105 K/s | 115 K/s | 202 K/s **+165%** |
| Bulk Search | 118 K/s | 174 K/s | 173 K/s | 304 K/s **+157%** |
| Recall @ 10 | 99% | 99.2% | 99.1% | 99.2% |

> Dataset: 1M vectors sample of the Deep1B dataset.
> Hardware: `c7g.metal` AWS instance with 64 cores and DDR5 memory.
> HNSW was configured with identical hyper-parameters:
> connectivity `M=16`,
> expansion @ construction `efConstruction=128`,
> and expansion @ search `ef=64`.
> Batch size is 256.
> Both libraries were compiled for the target architecture.
> Jump to the [Performance Tuning][benchmarking] section to read about the effects of those hyper-parameters.
[benchmarking]: https://github.com/unum-cloud/usearch/blob/main/docs/benchmarks.md

## Disk-based Indexes
## Serving `Index` from Disk

With USearch, you can serve indexes from external memory, enabling you to optimize your server choices for indexing speed and serving costs.
This can result in **20x cost reduction** on AWS and other public clouds.
Expand All @@ -159,7 +167,7 @@ other_view = Index(ndim=..., metric=CompiledMetric(...))
other_view.view("index.usearch")
```

## Exact, Approximate, and Multi-Index Lookups
## Exact vs. Approximate Search

Approximate search methods, such as HNSW, are predominantly used when an exact brute-force search becomes too resource-intensive.
This typically occurs when you have millions of entries in a collection.
Expand All @@ -183,6 +191,8 @@ When compared to FAISS's `IndexFlatL2` in Google Colab, **[USearch may offer up
- `faiss.IndexFlatL2`: **55.3 ms**.
- `usearch.index.search`: **2.54 ms**.

## `Indexes` for Multi-Index Lookups

For larger workloads targeting billions or even trillions of vectors, parallel multi-index lookups become invaluable.
These lookups prevent the need to construct a single, massive index, allowing users to query multiple smaller ones instead.

Expand All @@ -198,6 +208,36 @@ multi_index = Indexes(
multi_index.search(...)
```

## Clustering

Once the index is constructed, it can be used to cluster entries much faster.
In essence, the `Index` itself can be seen as a clustering, and it allows iterative deepening.

```py
clustering = index.cluster(
min_count=10, # Optional
max_count=15, # Optional
threads=..., # Optional
)

# Get the clusters and their sizes
centroid_keys, sizes = clustering.centroids_popularity

# Use Matplotlib to draw a histogram
clustering.plot_centroids_popularity()

# Export a NetworkX graph of the clusters
g = clustering.network

# Get members of a specific cluster
first_members = clustering.members_of(centroid_keys[0])

# Deepen into that cluster, splitting it into more parts; all the same arguments are supported
sub_clustering = clustering.subcluster(min_count=..., max_count=...)
```

Using Scikit-Learn, on a 1 Million point dataset, one may expect queries to take anywhere from minutes to hours, depending on the number of clusters you want to highlight. For 50'000 clusters the performance difference between USearch and conventional clustering methods may easily reach 100x.

## Joins, One-to-One, One-to-Many, and Many-to-Many Mappings

One of the big questions these days is how will AI change the world of databases and data management.
Expand Down Expand Up @@ -229,7 +269,7 @@ Broader functionality is ported per request.
| Add, search |||||||||
| Save, load, view |||||||||
| User-defined metrics |||||||||
| Joins |||||||||
| Joins |||||||||
| Variable-length vectors |||||||||
| 4B+ capacities |||||||||

Expand Down Expand Up @@ -316,13 +356,17 @@ matches = index.search(fingerprints, 10)
[smiles]: https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system
[rdkit-fingerprints]: https://www.rdkit.org/docs/RDKit_Book.html#additional-information-about-the-fingerprints

### USearch + POI Coordinates = GIS Applications... on iOS?

With Objective-C and iOS bindings, USearch can be easily used in mobile applications.


## Integrations

- [x] GPT-Cache.
- [x] LangChain.
- [ ] ClickHouse.
- [ ] Microsoft Semantic Kernel.
- [x] GPTCache: [Python](https://github.com/zilliztech/GPTCache/releases/tag/0.1.29).
- [x] LangChain: [Python](https://github.com/langchain-ai/langchain/releases/tag/v0.0.257) and [JavaScript](https://github.com/hwchase17/langchainjs/releases/tag/0.0.125).
- [x] ClickHouse: [C++](https://github.com/ClickHouse/ClickHouse/pull/53447).
- [ ] Microsoft Semantic Kernel: [Python](https://github.com/microsoft/semantic-kernel/pull/2358) and C#.

## Citations

Expand All @@ -332,8 +376,8 @@ doi = {10.5281/zenodo.7949416},
author = {Vardanian, Ash},
title = {{USearch by Unum Cloud}},
url = {https://github.com/unum-cloud/usearch},
version = {0.13.0},
year = {2022}
version = {1.0.0},
year = {2022},
month = jun,
}
```
28 changes: 16 additions & 12 deletions binding.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,36 @@
"targets": [
{
"target_name": "usearch",
"sources": [
"javascript/lib.cpp"
],
"sources": ["javascript/lib.cpp"],
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"include",
"fp16/include",
"robin-map/include",
"simsimd/include"
"simsimd/include",
],
"dependencies": ["<!(node -p \"require('node-addon-api').gyp\")"],
"cflags": [
"-fexceptions",
"-Wno-unknown-pragmas",
"-Wno-maybe-uninitialized",
],
"dependencies": [
"<!(node -p \"require('node-addon-api').gyp\")"
"cflags_cc": [
"-fexceptions",
"-Wno-unknown-pragmas",
"-Wno-maybe-uninitialized",
"-std=c++11",
],
"cflags": ["-fexceptions", "-Wno-unknown-pragmas", "-Wno-maybe-uninitialized"],
"cflags_cc": ["-fexceptions", "-Wno-unknown-pragmas", "-Wno-maybe-uninitialized", "-std=c++11"],
"xcode_settings": {
"GCC_ENABLE_CPP_EXCEPTIONS": "YES",
"CLANG_CXX_LIBRARY": "libc++",
"MACOSX_DEPLOYMENT_TARGET": "10.15"
"MACOSX_DEPLOYMENT_TARGET": "10.15",
},
"msvs_settings": {
"VCCLCompilerTool": {
"ExceptionHandling": 1,
"AdditionalOptions": ["-std:c++11"]
"AdditionalOptions": ["-std:c++11"],
}
}
},
}
]
}
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ model {
include "**/*.cpp"
}
exportedHeaders {
srcDirs "include", "fp16/include", "robin-map/include", "simsimd/include", "${Jvm.current().javaHome}/include"
srcDirs "include", "fp16/include", "simsimd/include", "${Jvm.current().javaHome}/include"
}
}
}
Expand Down
1 change: 0 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ fn main() {
.include("include")
.include("rust")
.include("fp16/include")
.include("robin-map/include")
.include("simsimd/include")
.compile("usearch");

Expand Down
Loading

0 comments on commit 55e4b05

Please sign in to comment.