
Commit 3df0f70

Merge branch 'main' of github.com:elastic/elasticsearch into custom-inference-service
2 parents a8c5241 + 23b7a31 commit 3df0f70

154 files changed: +6555 additions, -968 deletions


BUILDING.md

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ To wire this registered cluster into a `TestClusterAware` task (e.g. `RestIntegT
 Additional integration tests for a certain Elasticsearch modules that are specific to certain cluster configuration can be declared in a separate so called `qa` subproject of your module.
 
 The benefit of a dedicated project for these tests are:
-- `qa` projects are dedicated two specific use-cases and easier to maintain
+- `qa` projects are dedicated to specific use-cases and easier to maintain
 - It keeps the specific test logic separated from the common test logic.
 - You can run those tests in parallel to other projects of the build.

Lines changed: 204 additions & 0 deletions

@@ -0,0 +1,204 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */
package org.elasticsearch.benchmark.vector;

import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.VectorUtil;
import org.apache.lucene.util.quantization.OptimizedScalarQuantizer;
import org.elasticsearch.common.logging.LogConfigurator;
import org.elasticsearch.simdvec.internal.vectorization.ES91OSQVectorsScorer;
import org.elasticsearch.simdvec.internal.vectorization.ESVectorizationProvider;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;
import java.nio.file.Files;
import java.util.Random;
import java.util.concurrent.TimeUnit;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
// first iteration is complete garbage, so make sure we really warmup
@Warmup(iterations = 4, time = 1)
// real iterations. not useful to spend tons of time here, better to fork more
@Measurement(iterations = 5, time = 1)
// engage some noise reduction
@Fork(value = 1)
public class OSQScorerBenchmark {

    static {
        LogConfigurator.configureESLogging(); // native access requires logging to be initialized
    }

    @Param({ "1024" })
    int dims;

    int length;

    int numVectors = ES91OSQVectorsScorer.BULK_SIZE * 10;
    int numQueries = 10;

    byte[][] binaryVectors;
    byte[][] binaryQueries;
    OptimizedScalarQuantizer.QuantizationResult result;
    float centroidDp;

    byte[] scratch;
    ES91OSQVectorsScorer scorer;

    IndexInput in;

    float[] scratchScores;
    float[] corrections;

    @Setup
    public void setup() throws IOException {
        Random random = new Random(123);

        this.length = OptimizedScalarQuantizer.discretize(dims, 64) / 8;

        binaryVectors = new byte[numVectors][length];
        for (byte[] binaryVector : binaryVectors) {
            random.nextBytes(binaryVector);
        }

        Directory dir = new MMapDirectory(Files.createTempDirectory("vectorData"));
        IndexOutput out = dir.createOutput("vectors", IOContext.DEFAULT);
        byte[] correctionBytes = new byte[14 * ES91OSQVectorsScorer.BULK_SIZE];
        for (int i = 0; i < numVectors; i += ES91OSQVectorsScorer.BULK_SIZE) {
            for (int j = 0; j < ES91OSQVectorsScorer.BULK_SIZE; j++) {
                out.writeBytes(binaryVectors[i + j], 0, binaryVectors[i + j].length);
            }
            random.nextBytes(correctionBytes);
            out.writeBytes(correctionBytes, 0, correctionBytes.length);
        }
        out.close();
        in = dir.openInput("vectors", IOContext.DEFAULT);

        binaryQueries = new byte[numVectors][4 * length];
        // randomize the query vectors
        for (byte[] binaryQuery : binaryQueries) {
            random.nextBytes(binaryQuery);
        }
        result = new OptimizedScalarQuantizer.QuantizationResult(
            random.nextFloat(),
            random.nextFloat(),
            random.nextFloat(),
            Short.toUnsignedInt((short) random.nextInt())
        );
        centroidDp = random.nextFloat();

        scratch = new byte[length];
        scorer = ESVectorizationProvider.getInstance().newES91OSQVectorsScorer(in, dims);
        scratchScores = new float[16];
        corrections = new float[3];
    }

    @Benchmark
    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
    public void scoreFromArray(Blackhole bh) throws IOException {
        for (int j = 0; j < numQueries; j++) {
            in.seek(0);
            for (int i = 0; i < numVectors; i++) {
                in.readBytes(scratch, 0, length);
                float qDist = VectorUtil.int4BitDotProduct(binaryQueries[j], scratch);
                in.readFloats(corrections, 0, corrections.length);
                int addition = Short.toUnsignedInt(in.readShort());
                float score = scorer.score(
                    result,
                    VectorSimilarityFunction.EUCLIDEAN,
                    centroidDp,
                    corrections[0],
                    corrections[1],
                    addition,
                    corrections[2],
                    qDist
                );
                bh.consume(score);
            }
        }
    }

    @Benchmark
    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
    public void scoreFromMemorySegmentOnlyVector(Blackhole bh) throws IOException {
        for (int j = 0; j < numQueries; j++) {
            in.seek(0);
            for (int i = 0; i < numVectors; i++) {
                float qDist = scorer.quantizeScore(binaryQueries[j]);
                in.readFloats(corrections, 0, corrections.length);
                int addition = Short.toUnsignedInt(in.readShort());
                float score = scorer.score(
                    result,
                    VectorSimilarityFunction.EUCLIDEAN,
                    centroidDp,
                    corrections[0],
                    corrections[1],
                    addition,
                    corrections[2],
                    qDist
                );
                bh.consume(score);
            }
        }
    }

    @Benchmark
    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
    public void scoreFromMemorySegmentOnlyVectorBulk(Blackhole bh) throws IOException {
        for (int j = 0; j < numQueries; j++) {
            in.seek(0);
            for (int i = 0; i < numVectors; i += 16) {
                scorer.quantizeScoreBulk(binaryQueries[j], ES91OSQVectorsScorer.BULK_SIZE, scratchScores);
                for (int k = 0; k < ES91OSQVectorsScorer.BULK_SIZE; k++) {
                    in.readFloats(corrections, 0, corrections.length);
                    int addition = Short.toUnsignedInt(in.readShort());
                    float score = scorer.score(
                        result,
                        VectorSimilarityFunction.EUCLIDEAN,
                        centroidDp,
                        corrections[0],
                        corrections[1],
                        addition,
                        corrections[2],
                        scratchScores[k]
                    );
                    bh.consume(score);
                }
            }
        }
    }

    @Benchmark
    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
    public void scoreFromMemorySegmentAllBulk(Blackhole bh) throws IOException {
        for (int j = 0; j < numQueries; j++) {
            in.seek(0);
            for (int i = 0; i < numVectors; i += 16) {
                scorer.scoreBulk(binaryQueries[j], result, VectorSimilarityFunction.EUCLIDEAN, centroidDp, scratchScores);
                bh.consume(scratchScores);
            }
        }
    }
}

docs/changelog/124708.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 124708
summary: Throw exception for unknown token in RestIndexPutAliasAction
area: Indices APIs
type: enhancement
issues: []

docs/changelog/124737.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 124737
summary: Throw exception for unsupported values type in Alias
area: Indices APIs
type: enhancement
issues: []

docs/changelog/125922.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 125922
summary: Fix text structure NPE when fields in list have null value
area: Machine Learning
type: bug
issues: []

docs/changelog/126286.yaml

Lines changed: 0 additions & 6 deletions
This file was deleted.

docs/changelog/126629.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 126629
summary: Default new `semantic_text` fields to use BBQ when models are compatible
area: Relevance
type: enhancement
issues: []

docs/changelog/127134.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 127134
summary: Define a default oversample value for dense vectors with bbq_hnsw/bbq_flat
area: Vector Search
type: enhancement
issues: []
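
For context, a minimal sketch of a `dense_vector` mapping that uses the `bbq_hnsw` index type this default applies to; the index and field names are hypothetical, and the oversample value itself is supplied by the change when none is configured explicitly:

```js
PUT my-index
{
  "mappings": {
    "properties": {
      "emb": {
        "type": "dense_vector",
        "dims": 1024,
        "index_options": { "type": "bbq_hnsw" }
      }
    }
  }
}
```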

docs/changelog/127139.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 127139
summary: Add `suggested_cast`
area: ES|QL
type: enhancement
issues: []

docs/changelog/127229.yaml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
pr: 127229
summary: Return BAD_REQUEST when a field scorer references a missing field
area: Ranking
type: bug
issues:
 - 127162

docs/changelog/127285.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 127285
summary: Restore model registry validation for the semantic text field
area: Search
type: enhancement
issues: []

docs/changelog/127351.yaml

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
pr: 127351
summary: Allow partial results by default in ES|QL
area: ES|QL
type: breaking
issues: [122802]

breaking:
  title: Allow partial results by default in ES|QL
  area: ES|QL
  details: >-
    In earlier versions of {es}, ES|QL would fail the entire query if it encountered any error. ES|QL now returns partial results instead of failing when encountering errors.
  impact: >-
    Callers should check the `is_partial` flag returned in the response to determine if the result is partial or complete. If returning partial results is not desired, this option can be overridden per request via an `allow_partial_results` parameter in the query URL or globally via the cluster setting `esql.query.allow_partial_results`.
  notable: true
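
As a hedged illustration of the override described above, using the parameter and setting named in the entry (the index name and query are hypothetical), a caller could disable partial results per request or cluster-wide:

```js
POST /_query?allow_partial_results=false
{
  "query": "FROM my-index | LIMIT 10"
}

PUT /_cluster/settings
{
  "persistent": {
    "esql.query.allow_partial_results": false
  }
}
```

When partial results are allowed, the `is_partial` flag in the response indicates whether the result is complete.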

docs/changelog/127414.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pr: 127414
summary: Fix npe when using source confirmed text query against missing field
area: Search
type: bug
issues: []

docs/reference/enrich-processor/date-processor.md

Lines changed: 63 additions & 2 deletions
@@ -6,7 +6,6 @@ mapped_pages:
 
 # Date processor [date-processor]
 
-
 Parses dates from fields, and then uses the date or timestamp as the timestamp for the document. By default, the date processor adds the parsed date as a new field called `@timestamp`. You can specify a different field by setting the `target_field` configuration parameter. Multiple date formats are supported as part of the same date processor definition. They will be used sequentially to attempt parsing the date field, in the same order they were defined as part of the processor definition.
 
 $$$date-options$$$
@@ -16,7 +15,7 @@ $$$date-options$$$
 | `field` | yes | - | The field to get the date from. |
 | `target_field` | no | @timestamp | The field that will hold the parsed date. |
 | `formats` | yes | - | An array of the expected date formats. Can be a [java time pattern](/reference/elasticsearch/mapping-reference/mapping-date-format.md) or one of the following formats: ISO8601, UNIX, UNIX_MS, or TAI64N. |
-| `timezone` | no | UTC | The timezone to use when parsing the date. Supports [template snippets](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#template-snippets). |
+| `timezone` | no | UTC | The default [timezone](#date-processor-timezones) used by the processor. Supports [template snippets](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#template-snippets). |
 | `locale` | no | ENGLISH | The locale to use when parsing the date, relevant when parsing month names or week days. Supports [template snippets](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#template-snippets). |
 | `output_format` | no | `yyyy-MM-dd'T'HH:mm:ss.SSSXXX` | The format to use when writing the date to `target_field`. Must be a valid [java time pattern](/reference/elasticsearch/mapping-reference/mapping-date-format.md). |
 | `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. |
@@ -25,6 +24,20 @@ $$$date-options$$$
 | `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). |
 | `tag` | no | - | Identifier for the processor. Useful for debugging and metrics. |
 
+## Timezones [date-processor-timezones]
+
+The `timezone` option may have two effects on the behavior of the processor:
+- If the string being parsed matches a format representing a local date-time, such as `yyyy-MM-dd HH:mm:ss`, it will be assumed to be in the timezone specified by this option. This is not applicable if the string matches a format representing a zoned date-time, such as `yyyy-MM-dd HH:mm:ss zzz`: in that case, the timezone parsed from the string will be used. It is also not applicable if the string matches an absolute time format, such as `epoch_millis`.
+- The date-time will be converted into the timezone given by this option before it is formatted and written into the target field. This is not applicable if the `output_format` is an absolute time format such as `epoch_millis`.
+
+::::{warning}
+We recommend avoiding the use of short abbreviations for timezone names, since they can be ambiguous. For example, one JDK might interpret `PST` as `America/Tijuana`, i.e. Pacific (Standard) Time, while another JDK might interpret it as `Asia/Manila`, i.e. Philippine Standard Time. If your input data contains such abbreviations, you should convert them into either standard full names or UTC offsets before parsing them, using your own knowledge of what each abbreviation means in your data. See [below](#date-processor-short-timezone-example) for an example. (This does not apply to `UTC`, which is safe.)
+::::
+
+## Examples [date-processor-examples]
+
+### Simple example [date-processor-simple-example]
+
 Here is an example that adds the parsed date to the `timestamp` field based on the `initial_date` field:
 
 ```js
@@ -43,6 +56,8 @@ Here is an example that adds the parsed date to the `timestamp` field based on t
 }
 ```
 
+### Example using templated parameters [date-processor-templated-example]
+
 The `timezone` and `locale` processor parameters are templated. This means that their values can be extracted from fields within documents. The example below shows how to extract the locale/timezone details from existing fields, `my_timezone` and `my_locale`, in the ingested document that contain the timezone and locale values.
 
 ```js
@@ -62,3 +77,49 @@ The `timezone` and `locale` processor parameters are templated. This means that
 }
 ```
 
+### Example dealing with short timezone abbreviations safely [date-processor-short-timezone-example]
+
+In the example below, the `message` field in the input is expected to be a string formed of a local date-time in `yyyyMMddHHmmss` format, a timezone abbreviated to one of `PST`, `CET`, or `JST` representing Pacific, Central European, or Japan time, and a payload. This field is split up using a `grok` processor, then the timezones are converted into full names using a `script` processor, then the date-time is parsed using a `date` processor, and finally the unwanted fields are discarded using a `remove` processor.
+
+```js
+{
+  "description" : "...",
+  "processors": [
+    {
+      "grok": {
+        "field": "message",
+        "patterns": ["%{DATESTAMP_EVENTLOG:local_date_time} %{TZ:short_tz} %{GREEDYDATA:payload}"],
+        "pattern_definitions": {
+          "TZ": "[A-Z]{3}"
+        }
+      }
+    },
+    {
+      "script": {
+        "source": "ctx['full_tz'] = params['tz_map'][ctx['short_tz']]",
+        "params": {
+          "tz_map": {
+            "PST": "America/Los_Angeles",
+            "CET": "Europe/Amsterdam",
+            "JST": "Asia/Tokyo"
+          }
+        }
+      }
+    },
+    {
+      "date": {
+        "field": "local_date_time",
+        "formats": ["yyyyMMddHHmmss"],
+        "timezone": "{{{full_tz}}}"
+      }
+    },
+    {
+      "remove": {
+        "field": ["message", "local_date_time", "short_tz", "full_tz"]
+      }
+    }
+  ]
+}
+```
+
+With this pipeline, a `message` field with the value `20250102123456 PST Hello world` will result in a `@timestamp` field with the value `2025-01-02T12:34:56.000-08:00` and a `payload` field with the value `Hello world`. (Note: A `@timestamp` field will normally be mapped to a `date` type, and therefore it will be indexed as an integer representing milliseconds since the epoch, although the original format and timezone may be preserved in the `_source`.)
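
As an informal usage sketch, a `date` processor configuration like the ones documented above can be exercised against a sample document with the ingest simulate API; the format string and sample values here are illustrative assumptions:

```js
POST _ingest/pipeline/_simulate
{
  "pipeline": {
    "processors": [
      {
        "date": {
          "field": "initial_date",
          "target_field": "timestamp",
          "formats": ["dd/MM/yyyy HH:mm:ss"],
          "timezone": "Europe/Amsterdam"
        }
      }
    ]
  },
  "docs": [
    { "_source": { "initial_date": "02/01/2025 12:34:56" } }
  ]
}
```

Because `dd/MM/yyyy HH:mm:ss` is a local date-time format, the value is interpreted as Europe/Amsterdam time, matching the first effect described in the Timezones section above.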
