Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/91917.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 91917
summary: ML stats failures should not stop the usage API working
area: Machine Learning
type: bug
issues:
- 91893
1 change: 1 addition & 0 deletions x-pack/plugin/ml/qa/ml-with-security/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ tasks.named("yamlRestTest").configure {
'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid start param',
'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid end param',
'ml/jobs_get_result_overall_buckets/Test overall buckets given bucket_span is smaller than max job bucket_span',
'ml/jobs_get_stats/Test closed results index',
'ml/jobs_get_stats/Test get job stats given missing job',
'ml/jobs_get_stats/Test no exception on get job stats with missing index',
'ml/job_groups/Test put job with empty group',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
package org.elasticsearch.xpack.ml;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.Counter;
import org.elasticsearch.ElasticsearchException;
Expand Down Expand Up @@ -73,6 +74,8 @@

public class MachineLearningFeatureSet implements XPackFeatureSet {

private static final Logger logger = LogManager.getLogger(MachineLearningFeatureSet.class);

/**
* List of platforms for which the native processes are available
*/
Expand Down Expand Up @@ -368,58 +371,88 @@ public void execute(ActionListener<Usage> listener) {
nodeCount
);
listener.onResponse(usage);
}, listener::onFailure);
}, e -> {
logger.warn("Failed to get trained models usage to include in ML usage", e);
MachineLearningFeatureSetUsage usage = new MachineLearningFeatureSetUsage(
available,
enabled,
jobsUsage,
datafeedsUsage,
analyticsUsage,
inferenceUsage,
nodeCount
);
listener.onResponse(usage);
});

// Step 5. Extract usage from ingest statistics and gather trained model config count
GetTrainedModelsAction.Request getModelsRequest = new GetTrainedModelsAction.Request(
"*",
Collections.emptyList(),
Collections.emptySet()
);
getModelsRequest.setPageParams(new PageParams(0, 10_000));
ActionListener<NodesStatsResponse> nodesStatsListener = ActionListener.wrap(response -> {
addInferenceIngestUsage(response, inferenceUsage);
GetTrainedModelsAction.Request getModelsRequest = new GetTrainedModelsAction.Request(
"*",
Collections.emptyList(),
Collections.emptySet()
);
getModelsRequest.setPageParams(new PageParams(0, 10_000));
client.execute(GetTrainedModelsAction.INSTANCE, getModelsRequest, trainedModelsListener);
}, listener::onFailure);
}, e -> {
logger.warn("Failed to get inference ingest usage to include in ML usage", e);
client.execute(GetTrainedModelsAction.INSTANCE, getModelsRequest, trainedModelsListener);
});

// Step 4. Extract usage from data frame analytics configs and then request ingest node stats
String[] ingestNodes = ingestNodes(state);
NodesStatsRequest nodesStatsRequest = new NodesStatsRequest(ingestNodes).clear()
.addMetric(NodesStatsRequest.Metric.INGEST.metricName());
ActionListener<GetDataFrameAnalyticsAction.Response> dataframeAnalyticsListener = ActionListener.wrap(response -> {
addDataFrameAnalyticsUsage(response, analyticsUsage);
String[] ingestNodes = ingestNodes(state);
NodesStatsRequest nodesStatsRequest = new NodesStatsRequest(ingestNodes).clear()
.addMetric(NodesStatsRequest.Metric.INGEST.metricName());
client.execute(NodesStatsAction.INSTANCE, nodesStatsRequest, nodesStatsListener);
}, listener::onFailure);
}, e -> {
logger.warn("Failed to get data frame analytics configs to include in ML usage", e);
client.execute(NodesStatsAction.INSTANCE, nodesStatsRequest, nodesStatsListener);
});

// Step 3. Extract usage from data frame analytics stats and then request data frame analytics configs
GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
getDfaRequest.setPageParams(new PageParams(0, 10_000));
ActionListener<GetDataFrameAnalyticsStatsAction.Response> dataframeAnalyticsStatsListener = ActionListener.wrap(response -> {
addDataFrameAnalyticsStatsUsage(response, analyticsUsage);
GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
getDfaRequest.setPageParams(new PageParams(0, 10_000));
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
}, listener::onFailure);
}, e -> {
logger.warn("Failed to get data frame analytics stats to include in ML usage", e);
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
});

// Step 2. Extract usage from datafeeds stats and return usage response
GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
GetDatafeedsStatsAction.ALL
);
dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
ActionListener<GetDatafeedsStatsAction.Response> datafeedStatsListener = ActionListener.wrap(response -> {
addDatafeedsUsage(response);
GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
GetDatafeedsStatsAction.ALL
);
dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
}, listener::onFailure);
}, e -> {
logger.warn("Failed to get datafeed stats to include in ML usage", e);
client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
});

// Step 1. Extract usage from jobs stats and then request stats for all datafeeds
GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(GetDatafeedsStatsAction.ALL);
ActionListener<GetJobsStatsAction.Response> jobStatsListener = ActionListener.wrap(response -> {
jobManagerHolder.getJobManager().expandJobs(Metadata.ALL, true, ActionListener.wrap(jobs -> {
addJobsUsage(response, jobs.results());
GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(GetDatafeedsStatsAction.ALL);
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
}, listener::onFailure));
}, listener::onFailure);
}, e -> {
logger.warn("Failed to get job configs to include in ML usage", e);
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
}));
}, e -> {
logger.warn("Failed to get job stats to include in ML usage", e);
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
});

// Step 0. Kick off the chain of callbacks by requesting jobs stats
GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
client.execute(GetJobsStatsAction.INSTANCE, jobStatsRequest, jobStatsListener);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,3 +413,58 @@ setup:
- is_false: jobs.1.timing_stats.maximum_bucket_processing_time_ms
- is_false: jobs.1.timing_stats.average_bucket_processing_time_ms
- is_false: jobs.1.timing_stats.exponential_average_bucket_processing_time_ms

---
# Verifies the fix for "ML stats failures should not stop the usage API
# working" (PR 91917, issue 91893): when the ML results index is closed,
# the low-level ml.get_job_stats API fails, but xpack.usage must still
# succeed and simply omit the stats it could not gather.
# NOTE(review): leading indentation appears flattened by the page scrape;
# the original YAML is indented — content kept byte-identical here.
"Test closed results index":

# Needs the "warnings" test feature so the deprecation warning emitted by
# ml.post_data below can be asserted instead of failing the test.
- skip:
features:
- "warnings"

# Post two documents straight to the job so it produces results to persist.
- do:
warnings:
- 'Posting data directly to anomaly detection jobs is deprecated, in a future major version it will be compulsory to use a datafeed'
ml.post_data:
job_id: job-stats-test
body: >
{"airline":"AAL","responsetime":"132.2046","time":"1403481600"}
{"airline":"JZA","responsetime":"990.4628","time":"1403481600"}

# Close both jobs (created earlier in the suite — presumably in setup;
# TODO confirm) so their stats are flushed to the results index.
- do:
ml.close_job:
job_id: jobs-get-stats-datafeed-job
- match: { closed: true }

- do:
ml.close_job:
job_id: job-stats-test
- match: { closed: true }

# Sanity check while the results index is still open: both the low-level
# stats API and the usage API see the two closed jobs.
- do:
ml.get_job_stats: {}
- length: { jobs : 2 }

- do:
xpack.usage: {}
- match: { ml.available: true }
- match: { ml.enabled: true }
- match: { ml.jobs.closed.count: 2 }

# Close the shared anomalies results index to simulate ML internal-index
# corruption/unavailability.
- do:
indices.close:
index: .ml-anomalies-shared
wait_for_active_shards: index-setting

# With the index closed the low level ML API reports a problem
- do:
catch: /type=cluster_block_exception, reason=index \[.ml-anomalies-shared\] blocked by. \[FORBIDDEN\/.\/index closed\]/
ml.get_job_stats: {}

# But the high level X-Pack API returns what it can - we do this
# so that corruption to ML doesn't blind observers of the general
# cluster status
- do:
xpack.usage: {}
- match: { ml.available: true }
- match: { ml.enabled: true }
# jobs.closed.count is absent (not zero) because the job-stats step of the
# usage gathering failed and was skipped.
- is_false: ml.jobs.closed.count