COH-32073 - Add support for HnswIndex in ai module #232

Merged 3 commits on Apr 9, 2025
Changes from all commits
89 changes: 88 additions & 1 deletion src/coherence/ai.py
@@ -7,7 +7,7 @@
import base64
from abc import ABC
from collections import OrderedDict
from typing import Any, Dict, List, Optional, TypeVar, Union, cast
from typing import Any, Dict, Final, List, Optional, TypeVar, Union, cast

import jsonpickle
import numpy as np
@@ -337,6 +337,93 @@ def __init__(self, extractor: Union[ValueExtractor[T, E], str], over_sampling_factor
self.oversamplingFactor = over_sampling_factor


@proxy("coherence.hnsw.HnswIndex")
class HnswIndex(AbstractEvolvable):
DEFAULT_SPACE_NAME: Final[str] = "COSINE"
"""The default index space name."""

DEFAULT_MAX_ELEMENTS: Final[int] = 4096
"""
The default maximum number of elements the index can contain is 4096,
but the index will grow automatically by doubling its capacity until
it reaches approximately 8 million elements, at which point it will
grow by 50% whenever it gets full.
"""

DEFAULT_M: Final[int] = 16
"""
The default number of bidirectional links created for every new
element during construction is 16; the reasonable range for M is
2-100. Higher M values work better on datasets with high intrinsic
dimensionality and/or high recall, while lower M values work better
on datasets with low intrinsic dimensionality and/or low recall. The
parameter also determines the algorithm's memory consumption, which
is roughly M * 8-10 bytes per stored element. As an example, for
dim=4 random vectors the optimal M for search is somewhere around 6,
while for high-dimensional datasets (word embeddings, good face
descriptors) higher M values (e.g. M=48-64) are required for optimal
performance at high recall. The range M=12-48 is fine for most use
cases. When M is changed, the other parameters should be updated as
well; ef_search and ef_construction can be roughly estimated by
assuming that M * ef_construction is a constant.
"""

DEFAULT_EF_CONSTRUCTION: Final[int] = 200
"""
The parameter has the same meaning as ef_search, but controls the
index_time/index_accuracy trade-off. A bigger ef_construction leads to
longer construction but better index quality. At some point, increasing
ef_construction no longer improves the quality of the index. One way to
check whether the selected ef_construction is adequate is to measure the
recall for an M-nearest-neighbor search with ef_search = ef_construction:
if the recall is lower than 0.9, then there is room for improvement.
The default value is 200.
"""

DEFAULT_EF_SEARCH: Final[int] = 50
"""
The parameter controlling the query time/accuracy trade-off. The default
value is 50.
"""

DEFAULT_RANDOM_SEED: Final[int] = 100
"""The default random seed used for the index."""

def __init__(
self,
extractor: Union[ValueExtractor[T, E], str],
dimensions: int,
space_name: str = DEFAULT_SPACE_NAME,
max_elements: int = DEFAULT_MAX_ELEMENTS,
m: int = DEFAULT_M,
ef_construction: int = DEFAULT_EF_CONSTRUCTION,
ef_search: int = DEFAULT_EF_SEARCH,
random_seed: int = DEFAULT_RANDOM_SEED,
) -> None:
"""
Creates an instance of HnswIndex class.

:param extractor: The ValueExtractor to use to extract the Vector.
:param dimensions: The number of dimensions in the vector.
:param space_name: The index space name.
:param max_elements: The maximum number of elements the index can contain.
:param m: The number of bidirectional links created for every new element during construction.
:param ef_construction: The parameter controlling the index_time/index_accuracy trade-off.
:param ef_search: The parameter controlling query time/accuracy trade-off.
:param random_seed: The random seed used for the index.
"""

super().__init__()
self.extractor = extractor
self.dimensions = dimensions
self.spaceName = space_name if space_name else ""
self.maxElements = max_elements
self.m = m
self.efConstruction = ef_construction
self.efSearch = ef_search
self.randomSeed = random_seed


class Vectors:

EPSILON = 1e-30 # Python automatically handles float precision
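For readers of this PR, a minimal usage sketch of the new index follows. This is an illustration, not code from this change: the "vector_cache" name, the "vector" field, and the 384-dimensional size are assumptions, and the parameter choice simply applies the M * ef_construction heuristic from the docstring above (the defaults give 16 * 200 = 3200, so m=32 pairs with ef_construction of roughly 100).

from coherence import Extractors, NamedCache, Session
from coherence.ai import HnswIndex

async def add_hnsw_index(session: Session) -> None:
    # Assumed names: a cache of 384-dimensional float vectors stored
    # under a "vector" field.
    cache: NamedCache = await session.get_cache("vector_cache")
    # Doubling m from its default of 16 while keeping m * ef_construction
    # roughly constant (16 * 200 = 3200) suggests ef_construction near 100.
    cache.add_index(HnswIndex(Extractors.extract("vector"), 384, m=32, ef_construction=100))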
71 changes: 44 additions & 27 deletions tests/e2e/test_ai.py
@@ -8,7 +8,7 @@
import pytest

from coherence import COH_LOG, Extractors, NamedCache, Session
from coherence.ai import BinaryQuantIndex, DocumentChunk, FloatVector, SimilaritySearch, Vectors
from coherence.ai import BinaryQuantIndex, DocumentChunk, FloatVector, HnswIndex, SimilaritySearch, Vectors


class ValueWithVector:
@@ -94,9 +94,49 @@ async def populate_document_chunk_vectors(vectors: NamedCache[int, DocumentChunk]

@pytest.mark.asyncio
@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
async def test_similarity_search_with_index(test_session: Session) -> None:
async def test_similarity_search_with_binary_quant_index(test_session: Session) -> None:
await _run_similarity_search_with_index(test_session, "BinaryQuantIndex")


@pytest.mark.asyncio
@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
async def test_similarity_search_with_document_chunk(test_session: Session) -> None:
cache: NamedCache[int, DocumentChunk] = await test_session.get_cache("vector_cache")
dc: DocumentChunk = await populate_document_chunk_vectors(cache)

# Create a SimilaritySearch aggregator
value_extractor = Extractors.extract("vector")
k = 10
ss = SimilaritySearch(value_extractor, dc.vector, k)

hnsw_result = await cache.aggregate(ss)

assert hnsw_result is not None
assert len(hnsw_result) == k
COH_LOG.info("Results below for test_SimilaritySearch_with_DocumentChunk:")
for e in hnsw_result:
COH_LOG.info(e)

await cache.truncate()
await cache.destroy()


@pytest.mark.asyncio
@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
async def test_similarity_search_with_hnsw_index(test_session: Session) -> None:
await _run_similarity_search_with_index(test_session, "HnswIndex")


async def _run_similarity_search_with_index(test_session: Session, index_type: str) -> None:
cache: NamedCache[int, ValueWithVector] = await test_session.get_cache("vector_cache")
cache.add_index(BinaryQuantIndex(Extractors.extract("vector")))
if index_type == "BinaryQuantIndex":
cache.add_index(BinaryQuantIndex(Extractors.extract("vector")))
elif index_type == "HnswIndex":
cache.add_index(HnswIndex(Extractors.extract("vector"), DIMENSIONS))
else:
COH_LOG.error("NO index_type specified")
return

value_with_vector = await populate_vectors(cache)

# Create a SimilaritySearch aggregator
@@ -122,7 +162,7 @@ async def test_similarity_search_with_index(test_session: Session) -> None:
hnsw_result = await cache.aggregate(ss)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
COH_LOG.info("Results below for test_SimilaritySearch with Index:")
COH_LOG.info("Results below for test_SimilaritySearch with HnswIndex:")
for e in hnsw_result:
COH_LOG.info(e)
COH_LOG.info(f"Elapsed time: {elapsed_time} seconds")
@@ -132,26 +172,3 @@ async def test_similarity_search_with_index(test_session: Session) -> None:

await cache.truncate()
await cache.destroy()


@pytest.mark.asyncio
@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
async def test_similarity_search_with_document_chunk(test_session: Session) -> None:
cache: NamedCache[int, DocumentChunk] = await test_session.get_cache("vector_cache")
dc: DocumentChunk = await populate_document_chunk_vectors(cache)

# Create a SimilaritySearch aggregator
value_extractor = Extractors.extract("vector")
k = 10
ss = SimilaritySearch(value_extractor, dc.vector, k)

hnsw_result = await cache.aggregate(ss)

assert hnsw_result is not None
assert len(hnsw_result) == k
COH_LOG.info("Results below for test_SimilaritySearch_with_DocumentChunk:")
for e in hnsw_result:
COH_LOG.info(e)

await cache.truncate()
await cache.destroy()
17 changes: 17 additions & 0 deletions tests/unit/test_serialization.py
@@ -15,6 +15,7 @@
CosineDistance,
DocumentChunk,
FloatVector,
HnswIndex,
QueryResult,
SimilaritySearch,
)
@@ -253,3 +254,19 @@ def test_binary_quant_index_serialization() -> None:

o = s.deserialize(ser)
assert isinstance(o, BinaryQuantIndex)


# noinspection PyUnresolvedReferences
def test_hnsw_index_serialization() -> None:
hnsw = HnswIndex(Extractors.extract("foo"), 384)
ser = s.serialize(hnsw)
assert ser == (
b'\x15{"@class": "coherence.hnsw.HnswIndex", "dataVersion": 0, '
b'"binFuture": null, "extractor": {"@class": "extractor.UniversalExtractor", '
b'"name": "foo", "params": null}, "dimensions": 384, "spaceName": "COSINE", '
b'"maxElements": 4096, "m": 16, "efConstruction": 200, "efSearch": 50, '
b'"randomSeed": 100}'
)

o = s.deserialize(ser)
assert isinstance(o, HnswIndex)
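
A possible follow-up check, sketched here as an assumption rather than part of this PR: it presumes the deserializer maps the JSON fields shown above back onto attributes, so that non-default parameters round-trip.

# Sketch only: assumes deserialization restores JSON keys as attributes.
def test_hnsw_index_custom_params_roundtrip() -> None:
    idx = HnswIndex(Extractors.extract("foo"), 384, m=32, ef_construction=400)
    o = s.deserialize(s.serialize(idx))
    assert isinstance(o, HnswIndex)
    assert o.m == 32
    assert o.efConstruction == 400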