
Increment inference stats counter for shard bulk inference calls #129140

Merged · 6 commits · Jun 10, 2025
docs/changelog/129140.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
+pr: 129140
+summary: Increment inference stats counter for shard bulk inference calls
+area: Machine Learning
+type: enhancement
+issues: []
InferencePlugin.java

@@ -344,22 +344,24 @@ public Collection<?> createComponents(PluginServices services) {
         }
         inferenceServiceRegistry.set(serviceRegistry);

+        var meterRegistry = services.telemetryProvider().getMeterRegistry();
+        var inferenceStats = InferenceStats.create(meterRegistry);
+        var inferenceStatsBinding = new PluginComponentBinding<>(InferenceStats.class, inferenceStats);
+
         var actionFilter = new ShardBulkInferenceActionFilter(
             services.clusterService(),
             serviceRegistry,
             modelRegistry.get(),
             getLicenseState(),
-            services.indexingPressure()
+            services.indexingPressure(),
+            inferenceStats
         );
         shardBulkInferenceActionFilter.set(actionFilter);

-        var meterRegistry = services.telemetryProvider().getMeterRegistry();
-        var inferenceStats = new PluginComponentBinding<>(InferenceStats.class, InferenceStats.create(meterRegistry));
-
         components.add(serviceRegistry);
         components.add(modelRegistry.get());
         components.add(httpClientManager);
-        components.add(inferenceStats);
+        components.add(inferenceStatsBinding);

         // Only add InferenceServiceNodeLocalRateLimitCalculator (which is a ClusterStateListener) for cluster aware rate limiting,
         // if the rate limiting feature flags are enabled, otherwise provide noop implementation
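
For context, the wiring above creates InferenceStats from the node's MeterRegistry before the action filter is constructed (previously it was created after, so the filter could not receive it), hands the same instance to the filter, and exposes it for injection via PluginComponentBinding. A minimal sketch of what InferenceStats.create could look like; the record shape, the metric name, and the registerLongCounter call are illustrative assumptions, not code from this PR:

    import org.elasticsearch.telemetry.metric.LongCounter;
    import org.elasticsearch.telemetry.metric.MeterRegistry;

    // Sketch: a stats holder registering one counter at startup, shared by
    // every component that reports inference request counts.
    public record InferenceStats(LongCounter requestCount) {
        public static InferenceStats create(MeterRegistry meterRegistry) {
            return new InferenceStats(
                meterRegistry.registerLongCounter(
                    "es.inference.requests.count.total", // metric name is assumed
                    "Inference request counts, partitioned by model and source",
                    "operations"
                )
            );
        }
    }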
ShardBulkInferenceActionFilter.java

@@ -63,6 +63,7 @@
 import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextUtils;
 import org.elasticsearch.xpack.inference.registry.ModelRegistry;
+import org.elasticsearch.xpack.inference.telemetry.InferenceStats;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -78,6 +79,8 @@
 import static org.elasticsearch.xpack.inference.InferencePlugin.INFERENCE_API_FEATURE;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunks;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunksLegacy;
+import static org.elasticsearch.xpack.inference.telemetry.InferenceStats.modelAttributes;
+import static org.elasticsearch.xpack.inference.telemetry.InferenceStats.responseAttributes;

 /**
  * A {@link MappedActionFilter} that intercepts {@link BulkShardRequest} to apply inference on fields specified
@@ -112,20 +115,23 @@ public class ShardBulkInferenceActionFilter implements MappedActionFilter {
     private final ModelRegistry modelRegistry;
     private final XPackLicenseState licenseState;
     private final IndexingPressure indexingPressure;
+    private final InferenceStats inferenceStats;
     private volatile long batchSizeInBytes;

     public ShardBulkInferenceActionFilter(
         ClusterService clusterService,
         InferenceServiceRegistry inferenceServiceRegistry,
         ModelRegistry modelRegistry,
         XPackLicenseState licenseState,
-        IndexingPressure indexingPressure
+        IndexingPressure indexingPressure,
+        InferenceStats inferenceStats
     ) {
         this.clusterService = clusterService;
         this.inferenceServiceRegistry = inferenceServiceRegistry;
         this.modelRegistry = modelRegistry;
         this.licenseState = licenseState;
         this.indexingPressure = indexingPressure;
+        this.inferenceStats = inferenceStats;
         this.batchSizeInBytes = INDICES_INFERENCE_BATCH_SIZE.get(clusterService.getSettings()).getBytes();
         clusterService.getClusterSettings().addSettingsUpdateConsumer(INDICES_INFERENCE_BATCH_SIZE, this::setBatchSize);
     }
@@ -386,10 +392,12 @@ public void onFailure(Exception exc) {
         public void onResponse(List<ChunkedInference> results) {
             try (onFinish) {
                 var requestsIterator = requests.iterator();
+                int success = 0;
                 for (ChunkedInference result : results) {
                     var request = requestsIterator.next();
                     var acc = inferenceResults.get(request.bulkItemIndex);
                     if (result instanceof ChunkedInferenceError error) {
+                        recordRequestCountMetrics(inferenceProvider.model, 1, error.exception());
                         acc.addFailure(
                             new InferenceException(
                                 "Exception when running inference id [{}] on field [{}]",
@@ -399,6 +407,7 @@ public void onResponse(List<ChunkedInference> results) {
                             )
                         );
                     } else {
+                        success++;
                         acc.addOrUpdateResponse(
                             new FieldInferenceResponse(
                                 request.field(),
@@ -412,12 +421,16 @@
                         );
                     }
                 }
+                if (success > 0) {
+                    recordRequestCountMetrics(inferenceProvider.model, success, null);
+                }
             }
         }

         @Override
         public void onFailure(Exception exc) {
             try (onFinish) {
+                recordRequestCountMetrics(inferenceProvider.model, requests.size(), exc);
                 for (FieldInferenceRequest request : requests) {
                     addInferenceResponseFailure(
                         request.bulkItemIndex,
@@ -444,6 +457,14 @@ public void onFailure(Exception exc) {
         );
     }

+    private void recordRequestCountMetrics(Model model, int incrementBy, Throwable throwable) {
+        Map<String, Object> requestCountAttributes = new HashMap<>();
+        requestCountAttributes.putAll(modelAttributes(model));
+        requestCountAttributes.putAll(responseAttributes(throwable));
+        requestCountAttributes.put("inference_source", "semantic_text_bulk");
+        inferenceStats.requestCount().incrementBy(incrementBy, requestCountAttributes);
+    }
+
     /**
      * Adds all inference requests associated with their respective inference IDs to the given {@code requestsMap}
      * for the specified {@code item}.
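
The new helper merges model identity and response outcome into a single attribute map, then tags it with a fixed inference_source value so these shard-bulk increments can be told apart from direct Inference API calls. An illustration of the increment recorded for a batch in which 8 chunks succeeded; every attribute key except inference_source is an assumption about what modelAttributes and responseAttributes return:

    // Hypothetical attributes for a successful batch (keys assumed for illustration):
    Map<String, Object> attrs = Map.of(
        "service", "openai",              // from modelAttributes(model)
        "task_type", "text_embedding",    // from modelAttributes(model)
        "model_id", "my-embedding-model", // from modelAttributes(model)
        "inference_source", "semantic_text_bulk"
    );
    inferenceStats.requestCount().incrementBy(8, attrs); // one increment for all successes

Note the counting semantics in the hunks above: each failed chunk is recorded immediately with its own error attributes, all successful chunks for a provider are batched into a single increment, and a failure of the whole batch records requests.size() at once.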