Skip to content

Vector search uses ANN index regardless of metric #5608

@wjones127

Description

@wjones127

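If you build an IVF_PQ index with metric="dot", that index is used for ANN queries even when the query specifies metric="l2" or metric="cosine". The returned distances are then incorrect: they appear to be dot-product distances rather than distances for the requested metric.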

Script
import pyarrow as pa
import pyarrow.compute as pc
import lance

# Reproduction: build an IVF_PQ index with metric="dot", then query it with
# several different metrics and compare indexed vs. non-indexed distances.
nrows = 1024
ndims = 32

# Create random vectors. pc.random yields values in [0, 1); adding 10.0 shifts
# (not scales) them into [10, 11), so the three metrics give clearly different
# distance magnitudes.
values = pc.random(nrows * ndims).cast(pa.float32())
values = pc.add(values, pa.scalar(10.0))
data = pa.table({"vector": pa.FixedSizeListArray.from_arrays(values, ndims)})

# In-memory dataset; the only index on "vector" is created with metric="dot".
ds = lance.write_dataset(data, "memory://")
ds.create_index("vector", index_type="IVF_PQ", metric="dot", num_partitions=1, num_sub_vectors=8)

# We use the index regardless of requested metric type: print the query plan
# with no metric given and with each explicit metric.
query_vec = pc.random(ndims).cast(pa.float32()).to_pylist()
print("default explain")
print(ds.scanner(nearest=dict(column="vector", k=5, q=query_vec)).explain_plan())
print("l2 explain")
print(ds.scanner(nearest=dict(column="vector", k=5, q=query_vec, metric="l2")).explain_plan())
print("dot explain")
print(ds.scanner(nearest=dict(column="vector", k=5, q=query_vec, metric="dot")).explain_plan())
print("cosine explain")
print(ds.scanner(nearest=dict(column="vector", k=5, q=query_vec, metric="cosine")).explain_plan())

# With the index in use, the distances are all the same whichever metric is
# requested.
print("Distances with index:")
l2_distances = ds.to_table(nearest=dict(column="vector", k=5, q=query_vec, metric="l2"), columns=["_distance"]).to_pylist()
print("L2 distances:", l2_distances)
dot_distances = ds.to_table(nearest=dict(column="vector", k=5, q=query_vec, metric="dot"), columns=["_distance"]).to_pylist()
print("Dot distances:", dot_distances)
cosine_distances = ds.to_table(nearest=dict(column="vector", k=5, q=query_vec, metric="cosine"), columns=["_distance"]).to_pylist()
print("Cosine distances:", cosine_distances)

# Same queries with use_index=False for comparison. The indexed distances
# above seem to match the dot distance calculation, not L2 or cosine.
print("Distances without index:")
l2_distances_no_index = ds.to_table(nearest=dict(column="vector", k=5, q=query_vec, metric="l2", use_index=False), columns=["_distance"]).to_pylist()
print("L2 distances no index:", l2_distances_no_index)
dot_distances_no_index = ds.to_table(nearest=dict(column="vector", k=5, q=query_vec, metric="dot", use_index=False), columns=["_distance"]).to_pylist()
print("Dot distances no index:", dot_distances_no_index)
cosine_distances_no_index = ds.to_table(nearest=dict(column="vector", k=5, q=query_vec, metric="cosine", use_index=False), columns=["_distance"]).to_pylist()
print("Cosine distances no index:", cosine_distances_no_index)
default explain
ProjectionExec: expr=[vector@2 as vector, _distance@0 as _distance]
  Take: columns="_distance, _rowid, (vector)"
    CoalesceBatchesExec: target_batch_size=8192
      SortExec: TopK(fetch=5), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
        ANNSubIndex: name=vector_idx, k=5, deltas=1
          ANNIvfPartition: uuid=d1c62f61-4f34-491e-a371-d7abcbe45a86, minimum_nprobes=1, maximum_nprobes=None, deltas=1

l2 explain
ProjectionExec: expr=[vector@2 as vector, _distance@0 as _distance]
  Take: columns="_distance, _rowid, (vector)"
    CoalesceBatchesExec: target_batch_size=8192
      SortExec: TopK(fetch=5), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
        ANNSubIndex: name=vector_idx, k=5, deltas=1
          ANNIvfPartition: uuid=d1c62f61-4f34-491e-a371-d7abcbe45a86, minimum_nprobes=1, maximum_nprobes=None, deltas=1

dot explain
ProjectionExec: expr=[vector@2 as vector, _distance@0 as _distance]
  Take: columns="_distance, _rowid, (vector)"
    CoalesceBatchesExec: target_batch_size=8192
      SortExec: TopK(fetch=5), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
        ANNSubIndex: name=vector_idx, k=5, deltas=1
          ANNIvfPartition: uuid=d1c62f61-4f34-491e-a371-d7abcbe45a86, minimum_nprobes=1, maximum_nprobes=None, deltas=1

cosine explain
ProjectionExec: expr=[vector@2 as vector, _distance@0 as _distance]
  Take: columns="_distance, _rowid, (vector)"
    CoalesceBatchesExec: target_batch_size=8192
      SortExec: TopK(fetch=5), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
        ANNSubIndex: name=vector_idx, k=5, deltas=1
          ANNIvfPartition: uuid=d1c62f61-4f34-491e-a371-d7abcbe45a86, minimum_nprobes=1, maximum_nprobes=None, deltas=1

Distances with index:
L2 distances: [{'_distance': -156.54623413085938}, {'_distance': -156.29393005371094}, {'_distance': -156.27366638183594}, {'_distance': -156.1887969970703}, {'_distance': -156.06460571289062}]
Dot distances: [{'_distance': -156.54623413085938}, {'_distance': -156.29393005371094}, {'_distance': -156.27366638183594}, {'_distance': -156.1887969970703}, {'_distance': -156.06460571289062}]
Cosine distances: [{'_distance': -156.54623413085938}, {'_distance': -156.29393005371094}, {'_distance': -156.27366638183594}, {'_distance': -156.1887969970703}, {'_distance': -156.06460571289062}]
Distances without index:
L2 distances no index: [{'_distance': 3132.699951171875}, {'_distance': 3136.7939453125}, {'_distance': 3139.87548828125}, {'_distance': 3140.111572265625}, {'_distance': 3140.428955078125}]
Dot distances no index: [{'_distance': -156.77696228027344}, {'_distance': -156.3072509765625}, {'_distance': -156.26429748535156}, {'_distance': -156.1993865966797}, {'_distance': -156.03543090820312}]
Cosine distances no index: [{'_distance': 0.13222384452819824}, {'_distance': 0.1324082612991333}, {'_distance': 0.1326027512550354}, {'_distance': 0.13267850875854492}, {'_distance': 0.1328442096710205}]

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working) · critical-fix (Bugs that cause crashes, security vulnerabilities, or incorrect data.)

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions