Skip to content

Extend dense_vector to support indexing vectors #78491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ public enum LuceneFilesExtensions {
TVF("tvf", "Term Vector Fields", false, false),
TVM("tvm", "Term Vector Metadata", true, false),
TVX("tvx", "Term Vector Index", false, false),
// kNN vectors format
VEC("vec", "Vector Data", false, false),
// Lucene 9.0 indexed vectors metadata
VEM("vem","Vector Metadata", true, false);
VEX("vex", "Vector Index", false, false),
VEM("vem", "Vector Metadata", true, false);

/**
* Allow plugin developers of custom codecs to opt out of the assertion in {@link #fromExtension}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,37 @@ setup:
number_of_replicas: 0
mappings:
properties:
my_dense_vector:
vector:
type: dense_vector
dims: 5
indexed_vector:
type: dense_vector
dims: 5
index: true
similarity: dot_product
- do:
index:
index: test-index
id: 1
body:
my_dense_vector: [230.0, 300.33, -34.8988, 15.555, -200.0]
vector: [230.0, 300.33, -34.8988, 15.555, -200.0]
indexed_vector: [230.0, 300.33, -34.8988, 15.555, -200.0]

- do:
index:
index: test-index
id: 2
body:
my_dense_vector: [-0.5, 100.0, -13, 14.8, -156.0]
vector: [-0.5, 100.0, -13, 14.8, -156.0]
indexed_vector: [-0.5, 100.0, -13, 14.8, -156.0]

- do:
index:
index: test-index
id: 3
body:
my_dense_vector: [0.5, 111.3, -13.0, 14.8, -156.0]
vector: [0.5, 111.3, -13.0, 14.8, -156.0]
indexed_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

- do:
indices.refresh: {}
Expand All @@ -51,7 +59,7 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "dotProduct(params.query_vector, 'my_dense_vector')"
source: "dotProduct(params.query_vector, 'vector')"
params:
query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

Expand Down Expand Up @@ -81,7 +89,37 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'my_dense_vector')"
source: "cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

- match: {hits.total: 3}

- match: {hits.hits.0._id: "3"}
- gte: {hits.hits.0._score: 0.999}
- lte: {hits.hits.0._score: 1.001}

- match: {hits.hits.1._id: "2"}
- gte: {hits.hits.1._score: 0.998}
- lte: {hits.hits.1._score: 1.0}

- match: {hits.hits.2._id: "1"}
- gte: {hits.hits.2._score: 0.78}
- lte: {hits.hits.2._score: 0.791}

---
"Cosine similarity with indexed vector":
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'indexed_vector')"
params:
query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ setup:
- skip:
features: headers
version: " - 7.3.99"
reason: "dense_vector functions check on empty values was added from 7.4"
reason: "vector functions check on empty values was added from 7.4"

- do:
indices.create:
Expand All @@ -13,10 +13,14 @@ setup:
number_of_shards: 1
mappings:
properties:
my_dense_vector:
type: dense_vector
dims: 3

vector:
type: dense_vector
dims: 3
indexed_vector:
type: dense_vector
dims: 3
index: true
similarity: l2_norm

---
"Indexing of Dense vectors should error when dims don't match defined in the mapping":
Expand All @@ -27,7 +31,16 @@ setup:
index: test-index
id: 1
body:
my_dense_vector: [10, 2]
vector: [10, 2]
- match: { error.type: "mapper_parsing_exception" }

- do:
catch: bad_request
index:
index: test-index
id: 1
body:
indexed_vector: [10, 2]
- match: { error.type: "mapper_parsing_exception" }

---
Expand All @@ -37,14 +50,14 @@ setup:
index: test-index
id: 1
body:
my_dense_vector: [10, 10, 10]
vector: [10, 10, 10]

- do:
index:
index: test-index
id: 2
body:
my_dense_vector: [10.5, 10.9, 10.4]
vector: [10.5, 10.9, 10.4]

- do:
indices.refresh: {}
Expand All @@ -61,7 +74,7 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'my_dense_vector')"
source: "cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [10, 10, 10]

Expand All @@ -81,7 +94,7 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'my_dense_vector')"
source: "cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [10.0, 10.0, 10.0]

Expand All @@ -97,7 +110,7 @@ setup:
index: test-index
id: 1
body:
my_dense_vector: [1, 2, 3]
vector: [1, 2, 3]

- do:
indices.refresh: {}
Expand All @@ -112,7 +125,7 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'my_dense_vector')"
source: "cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [1, 2, 3, 4]
- match: { error.root_cause.0.type: "script_exception" }
Expand All @@ -127,7 +140,7 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "dotProduct(params.query_vector, 'my_dense_vector')"
source: "dotProduct(params.query_vector, 'vector')"
params:
query_vector: [1, 2, 3, 4]
- match: { error.root_cause.0.type: "script_exception" }
Expand All @@ -139,7 +152,7 @@ setup:
index: test-index
id: 1
body:
my_dense_vector: [10, 10, 10]
vector: [10, 10, 10]

- do:
index:
Expand All @@ -164,7 +177,24 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'my_dense_vector')"
source: "cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [10.0, 10.0, 10.0]
- match: { error.root_cause.0.type: "script_exception" }

- do:
catch: bad_request
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
index: test-index
body:
query:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'indexed_vector')"
params:
query_vector: [10.0, 10.0, 10.0]
- match: { error.root_cause.0.type: "script_exception" }
Expand All @@ -181,7 +211,27 @@ setup:
script_score:
query: {match_all: {} }
script:
source: "doc['my_dense_vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, 'my_dense_vector')"
source: "doc['vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [10.0, 10.0, 10.0]

- match: {hits.total: 2}
- match: {hits.hits.0._id: "1"}
- match: {hits.hits.1._id: "2"}
- match: {hits.hits.1._score: 0.0}

- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
index: test-index
body:
query:
script_score:
query: {match_all: {} }
script:
source: "doc['indexed_vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, 'indexed_vector')"
params:
query_vector: [10.0, 10.0, 10.0]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,71 @@
- match: { hits.hits.1._score: 1 }
- match: { hits.hits.2._id: "1" }
- match: { hits.hits.2._score: 0 }

---
# Test: a script can read the raw values (`vectorValue`) and the magnitude
# (`magnitude`) of a dense_vector field that is mapped with `index: true`.
"Access to values of indexed dense_vector in script":
# NOTE(review): the skip range and reason below refer to script access to
# vector values (7.13), but the mapping in this test also uses `index: true`
# and `similarity`. Confirm whether versions that predate indexed
# dense_vector support should be skipped as well.
- skip:
version: " - 7.12.99"
reason: "Access to values of dense_vector in script was added in 7.13"
# Create the index with a 3-dimensional indexed vector field `v`
# (similarity: dot_product).
- do:
indices.create:
index: test-index
body:
mappings:
properties:
v:
type: dense_vector
dims: 3
index: true
similarity: dot_product

# Index three documents that carry a vector and one ("missing_vector") that
# has no `v` field at all; `refresh: true` makes them searchable right away.
- do:
bulk:
index: test-index
refresh: true
body:
- '{"index": {"_id": "1"}}'
- '{"v": [1, 1, 1]}'
- '{"index": {"_id": "2"}}'
- '{"v": [1, 1, 2]}'
- '{"index": {"_id": "3"}}'
- '{"v": [1, 1, 3]}'
- '{"index": {"_id": "missing_vector"}}'
- '{}'

# vector functions in loop – return the index of the closest parameter vector based on cosine similarity
# The inner `exists` query limits scoring to documents that have `v`, so the
# "missing_vector" document is never passed to the script.
# The script computes the cosine similarity between the document vector and
# each vector in `params.pvs` by hand (dot product divided by the product of
# the magnitudes) and returns the index of the best match as the score.
# `pvs_magnts` holds the precomputed magnitudes of the `pvs` vectors:
# sqrt(3) ~ 1.7320, sqrt(6) ~ 2.4495, sqrt(11) ~ 3.3166.
- do:
search:
body:
query:
script_score:
query: { "exists": { "field": "v" } }
script:
source: |
float[] v = doc['v'].vectorValue;
float vm = doc['v'].magnitude;

int closestPv = 0;
float maxCosSim = -1;
for (int i = 0; i < params.pvs.length; i++) {
float dotProduct = 0;
for (int j = 0; j < v.length; j++) {
dotProduct += v[j] * params.pvs[i][j];
}
float cosSim = dotProduct / (vm * (float) params.pvs_magnts[i]);
if (maxCosSim < cosSim) {
maxCosSim = cosSim;
closestPv = i;
}
}
closestPv;
params:
pvs: [ [ 1, 1, 1 ], [ 1, 1, 2 ], [ 1, 1, 3 ] ]
pvs_magnts: [1.7320, 2.4495, 3.3166]

# Each document's vector equals exactly one entry of `pvs`, so its score is
# that entry's index: doc "3" -> 2, doc "2" -> 1, doc "1" -> 0. Hits are
# asserted in descending score order.
- match: { hits.hits.0._id: "3" }
- match: { hits.hits.0._score: 2 }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.1._score: 1 }
- match: { hits.hits.2._id: "1" }
- match: { hits.hits.2._score: 0 }
Loading