Merge pull request #148 from unum-cloud/main-dev

Semantic Joins
unum-cloud · Jul 18, 2023 · cc86e43 · cc86e43
2 parents a895338 + 33df240
commit cc86e43
Show file tree

Hide file tree

Showing 15 changed files with 1,041 additions and 208 deletions.
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
@@ -17,6 +17,33 @@ permissions:
 
 jobs:
 
+  build_test:
+    name: Test USearch
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        ref: main-dev
+    - run: git submodule update --init --recursive
+    - name: Prepare Environment
+      run: |
+        sudo apt update && 
+        sudo apt install -y cmake g++-12 build-essential libjemalloc-dev
+    - name: Build
+      run: |
+        cmake -B ./build_release \
+        -DCMAKE_CXX_COMPILER="g++-12" \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DUSEARCH_BUILD_TEST=1 \
+        -DUSEARCH_USE_OPENMP=1 \
+        -DUSEARCH_USE_SIMSIMD=1 \
+        -DUSEARCH_USE_JEMALLOC=1 \
+        -DUSEARCH_BUILD_BENCHMARK=0 &&
+        make -C ./build_release -j
+    - name: Run tests
+      run: ./build_release/test
+
+
   test_python_311:
     name: Test Python
     runs-on: ${{ matrix.os }}

diff --git a/README.md b/README.md
@@ -759,9 +759,28 @@ index = Index(ndim=2048, metric=MetricKind.BitwiseTanimoto)
 labels = np.arange(len(molecules))
 
 index.add(labels, fingerprints)
-matches = index.search(fingerprints, 10)
+matches: Matches = index.search(fingerprints, 10)
 ```
 
+Or, if you need a bit of our SIMD superpowers, take some chemistry-oriented precompiled metrics from [SimSIMD][simsimd].
+
+```python
+import simsimd as sisi
+
+index = Index(
+    ndim=166, # MACCS fingerprints are 166-dimensional
+    metric=CompiledMetric(
+        pointer=sisi.to_int(sisi.tanimoto_maccs_neon), # For Arm Neon
+        signature=MetricSignature.ArrayArray,
+        kind=MetricKind.Tanimoto,
+    ),
+)
+
+index.add(42, np.packbits([1] * 166))
+matches: Matches = index.search(np.packbits([1] * 166))
+```
+
+[simsimd]: https://github.com/ashvardanian/simsimd
 [smiles]: https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system
 [rdkit-fingerprints]: https://www.rdkit.org/docs/RDKit_Book.html#additional-information-about-the-fingerprints
 

diff --git a/c/Makefile b/c/Makefile
@@ -1,16 +1,17 @@
 CC = gcc-12
+CXX = g++-12
 
 C_FLAGS = -std=c99
+CXX_FLAGS = -std=c++17
 
-C_CXX_FLAGS = -Wall -Wextra -Wno-conversion -Wno-unknown-pragmas
-C_CXX_FLAGS += -O3 -march=native
+CXX_FLAGS += -Wall -Wextra -Wno-conversion -Wno-unknown-pragmas -O3 -march=native
 
 HEADER_INCLUDES = -I.  -I ../include/  -I ../fp16/include/ -I ../robin-map/include/
 
 .PHONY: build
 build:
-	$(CC) $(C_FLAGS) $(C_CXX_FLAGS) -o libusearch.so -O3 lib.cpp $(HEADER_INCLUDES) -shared -fPIC
+	$(CXX) $(CXX_FLAGS) -o libusearch.so -O3 lib.cpp $(HEADER_INCLUDES) -shared -fPIC
 
 .PHONY: test
 test:
-	$(CC) $(C_FLAGS) $(C_CXX_FLAGS) test.c -L. -lusearch -Wl,-rpath,. -o test
+	$(CC) $(C_FLAGS) test.c -L. -lusearch -Wl,-rpath,. -o test
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -13,7 +13,9 @@ endif()
 # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -g")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=leak -fsanitize=alignment -fsanitize=undefined")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -Wextra -Wno-conversion -Wno-unknown-pragmas")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=1")
@@ -22,7 +24,9 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
 
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -g")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=leak -fsanitize=alignment -fsanitize=undefined")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wfatal-errors")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")

diff --git a/cpp/bench.cpp b/cpp/bench.cpp
@@ -24,6 +24,7 @@
 
 #include <sys/stat.h> // `stat`
 
+#include <algorithm>
 #include <csignal>
 #include <cstdio>
 #include <iostream>  // `std::cerr`
@@ -32,6 +33,7 @@
 #include <string>    // `std::to_string`
 #include <thread>    // `std::thread::hardware_concurrency()`
 #include <variant>   // `std::monostate`
+#include <vector>
 
 #include <clipp.h> // Command Line Interface
 #include <omp.h>   // `omp_set_num_threads()`
@@ -243,10 +245,10 @@ struct running_stats_printer_t {
         std::size_t new_progress = progress.load();
         if (new_progress - last_printed_progress < step)
             return;
-        print(new_progress);
+        print(new_progress, total);
     }
 
-    void print(std::size_t progress) {
+    void print(std::size_t progress, std::size_t total) {
 
         constexpr char bars_k[] = "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||";
         constexpr std::size_t bars_len_k = 60;
@@ -267,6 +269,7 @@ struct running_stats_printer_t {
 
         last_printed_progress = progress;
         last_printed_time = time_new;
+        this->total = total;
     }
 };
 
@@ -332,6 +335,8 @@ template <typename dataset_at, typename index_at> //
 static void single_shot(dataset_at& dataset, index_at& index, bool construct = true) {
     using label_t = typename index_at::label_t;
     using distance_t = typename index_at::distance_t;
+    using join_result_t = typename index_at::join_result_t;
+    constexpr std::size_t missing_label = std::numeric_limits<label_t>::max();
 
     std::printf("\n");
     std::printf("------------\n");
@@ -348,7 +353,7 @@ static void single_shot(dataset_at& dataset, index_at& index, bool construct = t
     search_many(index, dataset.queries_count(), dataset.query(0), dataset.dimensions(), dataset.neighborhood_size(),
                 found_neighbors.data(), found_distances.data());
 
-    // Evaluate quality
+    // Evaluate search quality
     std::size_t recall_at_1 = 0, recall_full = 0;
     for (std::size_t i = 0; i != dataset.queries_count(); ++i) {
         auto expected = dataset.neighborhood(i);
@@ -360,6 +365,38 @@ static void single_shot(dataset_at& dataset, index_at& index, bool construct = t
     std::printf("Recall@1 %.2f %%\n", recall_at_1 * 100.f / dataset.queries_count());
     std::printf("Recall %.2f %%\n", recall_full * 100.f / dataset.queries_count());
 
+    // Perform joins
+    std::vector<label_t> man_to_woman(dataset.vectors_count());
+    std::vector<label_t> woman_to_man(dataset.vectors_count());
+    std::size_t join_attempts = 0;
+    {
+        index_at& men = index;
+        index_at women = index.copy().index;
+        std::fill(man_to_woman.begin(), man_to_woman.end(), missing_label);
+        std::fill(woman_to_man.begin(), woman_to_man.end(), missing_label);
+        {
+            executor_default_t executor(index.limits().threads());
+            running_stats_printer_t printer{1, "Join"};
+            join_result_t result = index_at::join(          //
+                men, women, join_config_t{executor.size()}, //
+                man_to_woman.data(), woman_to_man.data(),   //
+                executor, [&](std::size_t progress, std::size_t total) {
+                    if (progress % 1000 == 0)
+                        printer.print(progress, total);
+                });
+            join_attempts = result.cycles;
+        }
+    }
+    // Evaluate join quality
+    std::size_t recall_join = 0, unmatched_count = 0;
+    for (std::size_t i = 0; i != index.size(); ++i) {
+        recall_join += man_to_woman[i] == i;
+        unmatched_count += man_to_woman[i] == missing_label;
+    }
+    std::printf("Recall Joins %.2f %%\n", recall_join * 100.f / index.size());
+    std::printf("Unmatched %.2f %% (%zu items)\n", unmatched_count * 100.f / index.size(), unmatched_count);
+    std::printf("Proposals %.2f / man (%zu total)\n", join_attempts * 1.f / index.size(), join_attempts);
+
     // Paginate
     std::vector<vector_id_t> hints(dataset.queries_count());
     for (std::size_t i = 0; i != hints.size(); ++i)
@@ -491,7 +528,7 @@ void run_punned(dataset_at& dataset, args_t const& args, index_config_t config,
 
     std::printf("Will benchmark an on-disk view\n");
 
-    index_at index_view = index.fork();
+    index_at index_view = index.fork().index;
     index_view.view(args.path_output.c_str());
     single_shot(dataset, index_view, false);
 }

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
@@ -117,21 +117,23 @@ OPTIONS
 BigANN benchmark is a good starting point, if you are searching for large collections of high-dimensional vectors.
 Those often come with precomputed ground-truth neighbors, which is handy for recall evaluation.
 
-| Dataset                                 | Scalar Type | Dimensions | Metric |   Size    |
-| :-------------------------------------- | :---------: | :--------: | :----: | :-------: |
-| [Unum UForm Wiki][unum-wiki]            |   float32   |    256     |   IP   |   1 GB    |
-| [Yandex Text-to-Image Sample][unum-t2i] |   float32   |    200     |  Cos   |   1 GB    |
-|                                         |             |            |        |           |
-| [Microsoft SPACEV][spacev]              |    int8     |    100     |   L2   |   93 GB   |
-| [Microsoft Turing-ANNS][turing]         |   float32   |    100     |   L2   |  373 GB   |
-| [Yandex Deep1B][deep]                   |   float32   |     96     |   L2   |  358 GB   |
-| [Yandex Text-to-Image][t2i]             |   float32   |    200     |  Cos   |  750 GB   |
-|                                         |             |            |        |           |
-| [ViT-L/12 LAION][laion]                 |   float32   |    2048    |  Cos   | 2 - 10 TB |
+| Dataset                                    | Scalar Type | Dimensions | Metric |   Size    |
+| :----------------------------------------- | :---------: | :--------: | :----: | :-------: |
+| [Unum UForm Creative Captions][unum-cc-3m] |   float32   |    256     |   IP   |   3 GB    |
+| [Unum UForm Wiki][unum-wiki-1m]            |   float32   |    256     |   IP   |   1 GB    |
+| [Yandex Text-to-Image Sample][unum-t2i]    |   float32   |    200     |  Cos   |   1 GB    |
+|                                            |             |            |        |           |
+| [Microsoft SPACEV][spacev]                 |    int8     |    100     |   L2   |   93 GB   |
+| [Microsoft Turing-ANNS][turing]            |   float32   |    100     |   L2   |  373 GB   |
+| [Yandex Deep1B][deep]                      |   float32   |     96     |   L2   |  358 GB   |
+| [Yandex Text-to-Image][t2i]                |   float32   |    200     |  Cos   |  750 GB   |
+|                                            |             |            |        |           |
+| [ViT-L/12 LAION][laion]                    |   float32   |    2048    |  Cos   | 2 - 10 TB |
 
 Luckily, smaller samples of those datasets are available.
 
-[unum-wiki]: https://huggingface.co/datasets/unum-cloud/ann-wiki-1m
+[unum-cc-3m]: https://huggingface.co/datasets/unum-cloud/ann-cc-3m
+[unum-wiki-1m]: https://huggingface.co/datasets/unum-cloud/ann-wiki-1m
 [unum-t2i]: https://huggingface.co/datasets/unum-cloud/ann-t2i-1m
 [spacev]: https://github.com/microsoft/SPTAG/tree/main/datasets/SPACEV1B
 [turing]: https://learning2hash.github.io/publications/microsoftturinganns1B/