|
7 | 7 | import base64
|
8 | 8 | from abc import ABC
|
9 | 9 | from collections import OrderedDict
|
10 |
| -from typing import Any, Dict, List, Optional, TypeVar, Union, cast |
| 10 | +from typing import Any, Dict, Final, List, Optional, TypeVar, Union, cast |
11 | 11 |
|
12 | 12 | import jsonpickle
|
13 | 13 | import numpy as np
|
@@ -337,6 +337,93 @@ def __init__(self, extractor: Union[ValueExtractor[T, E], str], over_sampling_fa
|
337 | 337 | self.oversamplingFactor = over_sampling_factor
|
338 | 338 |
|
339 | 339 |
|
@proxy("coherence.hnsw.HnswIndex")
class HnswIndex(AbstractEvolvable):
    """
    Configuration holder describing an HNSW vector index.

    The constructor only records the supplied settings on the instance; no
    index is built on the client by this class.
    """

    DEFAULT_SPACE_NAME: Final[str] = "COSINE"
    """Index space name used when the caller does not supply one."""

    DEFAULT_MAX_ELEMENTS: Final[int] = 4096
    """
    Default maximum number of elements the index can contain (4096).

    The index grows automatically: its capacity doubles each time it fills
    up until it reaches approximately 8m elements, after which it grows by
    50% whenever it gets full.
    """

    DEFAULT_M: Final[int] = 16
    """
    Default value (16) for M, the number of bidirectional links created for
    every new element during construction.

    A reasonable range for M is 2-100. Higher values work better on
    datasets with high intrinsic dimensionality and/or high recall, while
    lower values work better on datasets with low intrinsic dimensionality
    and/or low recall. M also determines the algorithm's memory
    consumption, which is roughly M * 8-10 bytes per stored element.

    For example, for dim=4 random vectors the optimal M for search is
    somewhere around 6, while high dimensional datasets (word embeddings,
    good face descriptors) need a higher M (e.g. M=48-64) for optimal
    performance at high recall. The range M=12-48 is fine for most use
    cases.

    When M is changed the other parameters have to be updated as well;
    ef and ef_construction can be roughly estimated by assuming that
    M * ef_construction is a constant.
    """

    DEFAULT_EF_CONSTRUCTION: Final[int] = 200
    """
    Default value (200) for ef_construction.

    This parameter has the same meaning as ef, but controls the
    index_time/index_accuracy trade-off. A bigger ef_construction leads to
    longer construction but better index quality; beyond some point,
    increasing it no longer improves the quality of the index.

    One way to check whether the chosen ef_construction is adequate is to
    measure the recall of an M nearest neighbor search with
    ef = ef_construction: if the recall is lower than 0.9, then there is
    room for improvement.
    """

    DEFAULT_EF_SEARCH: Final[int] = 50
    """
    Default value (50) for the parameter controlling the query
    time/accuracy trade-off.
    """

    DEFAULT_RANDOM_SEED: Final[int] = 100
    """Default random seed used for the index."""

    def __init__(
        self,
        extractor: Union[ValueExtractor[T, E], str],
        dimensions: int,
        space_name: str = DEFAULT_SPACE_NAME,
        max_elements: int = DEFAULT_MAX_ELEMENTS,
        m: int = DEFAULT_M,
        ef_construction: int = DEFAULT_EF_CONSTRUCTION,
        ef_search: int = DEFAULT_EF_SEARCH,
        random_seed: int = DEFAULT_RANDOM_SEED,
    ) -> None:
        """
        Create an HnswIndex holding the given configuration.

        :param extractor: The ValueExtractor (or its string form) used to
            extract the Vector.
        :param dimensions: The number of dimensions in the vector.
        :param space_name: The index space name; any falsy value is stored
            as an empty string.
        :param max_elements: The maximum number of elements the index can
            contain.
        :param m: The number of bidirectional links created for every new
            element during construction.
        :param ef_construction: The parameter controlling the
            index_time/index_accuracy trade-off.
        :param ef_search: The parameter controlling the query
            time/accuracy trade-off.
        :param random_seed: The random seed used for the index.
        """

        super().__init__()
        # Attribute names are camelCase, presumably to match the field names
        # of the proxied server-side type -- verify before renaming.
        self.extractor = extractor
        self.dimensions = dimensions
        # NOTE(review): a falsy space_name becomes "" rather than
        # DEFAULT_SPACE_NAME -- confirm the receiver treats "" as the default.
        self.spaceName = space_name or ""
        self.maxElements = max_elements
        self.m = m
        self.efConstruction = ef_construction
        self.efSearch = ef_search
        self.randomSeed = random_seed
| 425 | + |
| 426 | + |
340 | 427 | class Vectors:
|
341 | 428 |
|
342 | 429 | EPSILON = 1e-30 # Python automatically handles float precision
|
|
0 commit comments