COH-32073 - Add support for HnswIndex in ai module (#232)

dhirupandey · web-flow · commit f4f98bca8edc · 2025-04-09T12:07:08.000-07:00
* COH-32073 - Add support for HnswIndex in ai module
diff --git a/src/coherence/ai.py b/src/coherence/ai.py
@@ -7,7 +7,7 @@
 import base64
 from abc import ABC
 from collections import OrderedDict
-from typing import Any, Dict, List, Optional, TypeVar, Union, cast
+from typing import Any, Dict, Final, List, Optional, TypeVar, Union, cast
 
 import jsonpickle
 import numpy as np
@@ -337,6 +337,93 @@ def __init__(self, extractor: Union[ValueExtractor[T, E], str], over_sampling_fa
         self.oversamplingFactor = over_sampling_factor
 
 
+@proxy("coherence.hnsw.HnswIndex")
+class HnswIndex(AbstractEvolvable):
+    DEFAULT_SPACE_NAME: Final[str] = "COSINE"
+    """The default index space name."""
+
+    DEFAULT_MAX_ELEMENTS: Final[int] = 4096
+    """
+    The default maximum number of elements the index can contain is 4096
+    but the index will grow automatically by doubling its capacity until it
+    reaches approximately 8m elements, at which point it will grow by 50%
+    whenever it gets full.
+    """
+
+    DEFAULT_M: Final[int] = 16
+    """
+    The number of bidirectional links created for every new element during
+    construction; a reasonable range is 2-100. Higher M work better on datasets
+    with high intrinsic dimensionality and/or high recall, while low M work
+    better for datasets with low intrinsic dimensionality and/or low recalls.
+    The parameter also determines the algorithm's memory consumption,
+    which is roughly M * 8-10 bytes per stored element. As an example for
+    dim=4 random vectors optimal M for search is somewhere around 6,
+    while for high dimensional datasets (word embeddings, good face
+    descriptors), higher M are required (e.g. M=48-64) for optimal
+    performance at high recall. The range M=12-48 is ok for most use
+    cases. When M is changed one has to update the other parameters.
+    Nonetheless, the ef and ef_construction parameters can be roughly estimated
+    by assuming that M * ef_construction is a constant. The default value is
+    16.
+    """
+
+    DEFAULT_EF_CONSTRUCTION: Final[int] = 200
+    """
+    The parameter has the same meaning as ef, which controls the
+    index_time/index_accuracy. Bigger ef_construction leads to longer
+    construction, but better index quality. At some point, increasing
+    ef_construction does not improve the quality of the index. One way to
+    check if the selection of ef_construction was ok is to measure a recall
+    for M nearest neighbor search when ef = ef_construction: if the recall is
+    lower than 0.9, then there is room for improvement. The default value is
+    200.
+    """
+
+    DEFAULT_EF_SEARCH: Final[int] = 50
+    """
+    The parameter controlling query time/accuracy trade-off. The default
+    value is 50.
+    """
+
+    DEFAULT_RANDOM_SEED: Final[int] = 100
+    """The default random seed used for the index."""
+
+    def __init__(
+        self,
+        extractor: Union[ValueExtractor[T, E], str],
+        dimensions: int,
+        space_name: str = DEFAULT_SPACE_NAME,
+        max_elements: int = DEFAULT_MAX_ELEMENTS,
+        m: int = DEFAULT_M,
+        ef_construction: int = DEFAULT_EF_CONSTRUCTION,
+        ef_search: int = DEFAULT_EF_SEARCH,
+        random_seed: int = DEFAULT_RANDOM_SEED,
+    ) -> None:
+        """
+        Creates an instance of HnswIndex class.
+
+        :param extractor: The ValueExtractor to use to extract the Vector.
+        :param dimensions: The number of dimensions in the vector.
+        :param space_name: The index space name.
+        :param max_elements: The maximum number of elements the index can contain.
+        :param m: The number of bidirectional links created for every new element during construction.
+        :param ef_construction: The parameter controlling the index_time/index_accuracy.
+        :param ef_search: The parameter controlling query time/accuracy trade-off.
+        :param random_seed: The random seed used for the index.
+        """
+
+        super().__init__()
+        self.extractor = extractor
+        self.dimensions = dimensions
+        self.spaceName = space_name if space_name else ""
+        self.maxElements = max_elements
+        self.m = m
+        self.efConstruction = ef_construction
+        self.efSearch = ef_search
+        self.randomSeed = random_seed
+
+
 class Vectors:
 
     EPSILON = 1e-30  # Python automatically handles float precision
diff --git a/tests/e2e/test_ai.py b/tests/e2e/test_ai.py
@@ -8,7 +8,7 @@
 import pytest
 
 from coherence import COH_LOG, Extractors, NamedCache, Session
-from coherence.ai import BinaryQuantIndex, DocumentChunk, FloatVector, SimilaritySearch, Vectors
+from coherence.ai import BinaryQuantIndex, DocumentChunk, FloatVector, HnswIndex, SimilaritySearch, Vectors
 
 
 class ValueWithVector:
@@ -94,9 +94,49 @@ async def populate_document_chunk_vectors(vectors: NamedCache[int, DocumentChunk
 
 @pytest.mark.asyncio
 @pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
-async def test_similarity_search_with_index(test_session: Session) -> None:
+async def test_similarity_search_with_binary_quant_index(test_session: Session) -> None:
+    await _run_similarity_search_with_index(test_session, "BinaryQuantIndex")
+
+
+@pytest.mark.asyncio
+@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
+async def test_similarity_search_with_document_chunk(test_session: Session) -> None:
+    cache: NamedCache[int, DocumentChunk] = await test_session.get_cache("vector_cache")
+    dc: DocumentChunk = await populate_document_chunk_vectors(cache)
+
+    # Create a SimilaritySearch aggregator
+    value_extractor = Extractors.extract("vector")
+    k = 10
+    ss = SimilaritySearch(value_extractor, dc.vector, k)
+
+    hnsw_result = await cache.aggregate(ss)
+
+    assert hnsw_result is not None
+    assert len(hnsw_result) == k
+    COH_LOG.info("Results below for test_SimilaritySearch_with_DocumentChunk:")
+    for e in hnsw_result:
+        COH_LOG.info(e)
+
+    await cache.truncate()
+    await cache.destroy()
+
+
+@pytest.mark.asyncio
+@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
+async def test_similarity_search_with_hnsw_index(test_session: Session) -> None:
+    await _run_similarity_search_with_index(test_session, "HnswIndex")
+
+
+async def _run_similarity_search_with_index(test_session: Session, index_type: str) -> None:
     cache: NamedCache[int, ValueWithVector] = await test_session.get_cache("vector_cache")
-    cache.add_index(BinaryQuantIndex(Extractors.extract("vector")))
+    if index_type == "BinaryQuantIndex":
+        cache.add_index(BinaryQuantIndex(Extractors.extract("vector")))
+    elif index_type == "HnswIndex":
+        cache.add_index(HnswIndex(Extractors.extract("vector"), DIMENSIONS))
+    else:
+        COH_LOG.error("NO index_type specified")
+        return
+
     value_with_vector = await populate_vectors(cache)
 
     # Create a SimilaritySearch aggregator
@@ -122,7 +162,7 @@ async def test_similarity_search_with_index(test_session: Session) -> None:
     hnsw_result = await cache.aggregate(ss)
     end_time = time.perf_counter()
     elapsed_time = end_time - start_time
-    COH_LOG.info("Results below for test_SimilaritySearch with Index:")
+    COH_LOG.info("Results below for test_SimilaritySearch with HnswIndex:")
     for e in hnsw_result:
         COH_LOG.info(e)
     COH_LOG.info(f"Elapsed time: {elapsed_time} seconds")
@@ -132,26 +172,3 @@ async def test_similarity_search_with_index(test_session: Session) -> None:
 
     await cache.truncate()
     await cache.destroy()
-
-
-@pytest.mark.asyncio
-@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
-async def test_similarity_search_with_document_chunk(test_session: Session) -> None:
-    cache: NamedCache[int, DocumentChunk] = await test_session.get_cache("vector_cache")
-    dc: DocumentChunk = await populate_document_chunk_vectors(cache)
-
-    # Create a SimilaritySearch aggregator
-    value_extractor = Extractors.extract("vector")
-    k = 10
-    ss = SimilaritySearch(value_extractor, dc.vector, k)
-
-    hnsw_result = await cache.aggregate(ss)
-
-    assert hnsw_result is not None
-    assert len(hnsw_result) == k
-    COH_LOG.info("Results below for test_SimilaritySearch_with_DocumentChunk:")
-    for e in hnsw_result:
-        COH_LOG.info(e)
-
-    await cache.truncate()
-    await cache.destroy()
diff --git a/tests/unit/test_serialization.py b/tests/unit/test_serialization.py
@@ -15,6 +15,7 @@
     CosineDistance,
     DocumentChunk,
     FloatVector,
+    HnswIndex,
     QueryResult,
     SimilaritySearch,
 )
@@ -253,3 +254,19 @@ def test_binary_quant_index_serialization() -> None:
 
     o = s.deserialize(ser)
     assert isinstance(o, BinaryQuantIndex)
+
+
+# noinspection PyUnresolvedReferences
+def test_HnswIndex_serialization() -> None:
+    bqi = HnswIndex(Extractors.extract("foo"), 384)
+    ser = s.serialize(bqi)
+    assert ser == (
+        b'\x15{"@class": "coherence.hnsw.HnswIndex", "dataVersion": 0, '
+        b'"binFuture": null, "extractor": {"@class": "extractor.UniversalExtractor", '
+        b'"name": "foo", "params": null}, "dimensions": 384, "spaceName": "COSINE", '
+        b'"maxElements": 4096, "m": 16, "efConstruction": 200, "efSearch": 50, '
+        b'"randomSeed": 100}'
+    )
+
+    o = s.deserialize(ser)
+    assert isinstance(o, HnswIndex)