Skip to content

Commit 0666be5

Browse files
[ML] Validate at least one feature is available for DF analytics (#55876)
We were previously checking at least one supported field existed when the _explain API was called. However, in the case of analyses with required fields (e.g. regression) we were not accounting that the dependent variable is not a feature and thus if the source index only contains the dependent variable field there are no features to train a model on. This commit adds a validation that at least one feature is available for analysis. Note that we also move that validation away from `ExtractedFieldsDetector` and the _explain API and straight into the _start API. The reason for doing this is to allow the user to use the _explain API in order to understand why they would be seeing an error like this one. For example, the user might be using an index that has fields but they are of unsupported types. If they start the job and get an error that there are no features, they will wonder why that is. Calling the _explain API will show them that all their fields are unsupported. If the _explain API was failing instead, there would be no way for the user to understand why all those fields are ignored. Closes #55593
1 parent d70cef3 commit 0666be5

File tree

5 files changed

+96
-22
lines changed

5 files changed

+96
-22
lines changed

x-pack/plugin/ml/qa/ml-with-security/build.gradle

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,8 @@ integTest.runner {
187187
'ml/preview_datafeed/Test preview missing datafeed',
188188
'ml/revert_model_snapshot/Test revert model with invalid snapshotId',
189189
'ml/start_data_frame_analytics/Test start given missing source index',
190-
'ml/start_data_frame_analytics/Test start given source index has no compatible fields',
190+
'ml/start_data_frame_analytics/Test start outlier_detection given source index has no fields',
191+
'ml/start_data_frame_analytics/Test start regression given source index only has dependent variable',
191192
'ml/start_data_frame_analytics/Test start with inconsistent body/param ids',
192193
'ml/start_data_frame_analytics/Test start given dest index is not empty',
193194
'ml/start_data_frame_analytics/Test start with compatible fields but no data',

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
6262
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
6363
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState;
64+
import org.elasticsearch.xpack.core.ml.dataframe.analyses.RequiredField;
6465
import org.elasticsearch.xpack.core.ml.job.messages.Messages;
6566
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
6667
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
@@ -83,7 +84,9 @@
8384
import java.util.List;
8485
import java.util.Map;
8586
import java.util.Objects;
87+
import java.util.Set;
8688
import java.util.function.Predicate;
89+
import java.util.stream.Collectors;
8790

8891
import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN;
8992
import static org.elasticsearch.xpack.core.ClientHelper.executeAsyncWithOrigin;
@@ -244,7 +247,7 @@ private void getStartContext(String id, Task task, ActionListener<StartContext>
244247
ParentTaskAssigningClient parentTaskClient = new ParentTaskAssigningClient(client, task.getParentTaskId());
245248
// Step 7. Validate that there are analyzable data in the source index
246249
ActionListener<StartContext> validateMappingsMergeListener = ActionListener.wrap(
247-
startContext -> validateSourceIndexHasRows(startContext, finalListener),
250+
startContext -> validateSourceIndexHasAnalyzableData(startContext, finalListener),
248251
finalListener::onFailure
249252
);
250253

@@ -319,6 +322,37 @@ private void getStartContext(String id, Task task, ActionListener<StartContext>
319322
configProvider.get(id, getConfigListener);
320323
}
321324

325+
private void validateSourceIndexHasAnalyzableData(StartContext startContext, ActionListener<StartContext> listener) {
326+
ActionListener<Void> validateAtLeastOneAnalyzedFieldListener = ActionListener.wrap(
327+
aVoid -> validateSourceIndexHasRows(startContext, listener),
328+
listener::onFailure
329+
);
330+
331+
validateSourceIndexHasAtLeastOneAnalyzedField(startContext, validateAtLeastOneAnalyzedFieldListener);
332+
}
333+
334+
private void validateSourceIndexHasAtLeastOneAnalyzedField(StartContext startContext, ActionListener<Void> listener) {
335+
Set<String> requiredFields = startContext.config.getAnalysis().getRequiredFields().stream()
336+
.map(RequiredField::getName)
337+
.collect(Collectors.toSet());
338+
339+
// We assume here that required fields are not features
340+
long nonRequiredFieldsCount = startContext.extractedFields.getAllFields().stream()
341+
.filter(extractedField -> requiredFields.contains(extractedField.getName()) == false)
342+
.count();
343+
if (nonRequiredFieldsCount == 0) {
344+
StringBuilder msgBuilder = new StringBuilder("at least one field must be included in the analysis");
345+
if (requiredFields.isEmpty() == false) {
346+
msgBuilder.append(" (excluding fields ")
347+
.append(requiredFields)
348+
.append(")");
349+
}
350+
listener.onFailure(ExceptionsHelper.badRequestException(msgBuilder.toString()));
351+
} else {
352+
listener.onResponse(null);
353+
}
354+
}
355+
322356
private void validateSourceIndexHasRows(StartContext startContext, ActionListener<StartContext> listener) {
323357
DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(client,
324358
"validate_source_index_has_rows-" + startContext.config.getId(),

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,6 @@ private Set<String> getIncludedFields(Set<FieldSelection> fieldSelection) {
9595
}
9696
includeAndExcludeFields(fields, fieldSelection);
9797

98-
if (fields.isEmpty()) {
99-
throw ExceptionsHelper.badRequestException("No compatible fields could be detected in index {}. Supported types are {}.",
100-
Arrays.toString(index),
101-
getSupportedTypes());
102-
}
103-
10498
return fields;
10599
}
106100

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import static org.hamcrest.Matchers.containsInAnyOrder;
3838
import static org.hamcrest.Matchers.equalTo;
3939
import static org.hamcrest.Matchers.hasSize;
40+
import static org.hamcrest.Matchers.is;
4041
import static org.mockito.Mockito.mock;
4142
import static org.mockito.Mockito.when;
4243

@@ -91,10 +92,14 @@ public void testDetect_GivenOutlierDetectionAndNonNumericField() {
9192

9293
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
9394
SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
94-
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
95+
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
9596

96-
assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]." +
97-
" Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
97+
assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
98+
assertThat(fieldExtraction.v2().size(), equalTo(1));
99+
assertThat(fieldExtraction.v2().get(0).getName(), equalTo("some_keyword"));
100+
assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
101+
assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
102+
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
98103
}
99104

100105
public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericTypes() {
@@ -103,10 +108,14 @@ public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericType
103108

104109
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
105110
SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
106-
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
111+
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
107112

108-
assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " +
109-
"Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
113+
assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
114+
assertThat(fieldExtraction.v2().size(), equalTo(1));
115+
assertThat(fieldExtraction.v2().get(0).getName(), equalTo("indecisive_field"));
116+
assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
117+
assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
118+
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
110119
}
111120

112121
public void testDetect_GivenOutlierDetectionAndMultipleFields() {
@@ -306,10 +315,10 @@ public void testDetect_GivenIgnoredField() {
306315

307316
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
308317
SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
309-
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
318+
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
310319

311-
assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " +
312-
"Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
320+
assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
321+
assertThat(fieldExtraction.v2().isEmpty(), is(true));
313322
}
314323

315324
public void testDetect_GivenIncludedIgnoredField() {
@@ -410,9 +419,11 @@ public void testDetect_GivenExcludeAllValidFields() {
410419

411420
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
412421
SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
413-
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
414-
assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " +
415-
"Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
422+
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
423+
424+
assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
425+
assertThat(fieldExtraction.v2().size(), equalTo(2));
426+
assertThat(fieldExtraction.v2().stream().filter(FieldSelection::isIncluded).findAny().isPresent(), is(false));
416427
}
417428

418429
public void testDetect_GivenInclusionsAndExclusions() {

x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
id: "missing_index"
3737

3838
---
39-
"Test start given source index has no compatible fields":
39+
"Test start outlier_detection given source index has no fields":
4040

4141
- do:
4242
indices.create:
@@ -57,7 +57,41 @@
5757
}
5858
5959
- do:
60-
catch: /No compatible fields could be detected in index \[empty-index\]/
60+
catch: /at least one field must be included in the analysis/
61+
ml.start_data_frame_analytics:
62+
id: "foo"
63+
64+
---
65+
"Test start regression given source index only has dependent variable":
66+
67+
- do:
68+
indices.create:
69+
index: index-with-dep-var-only
70+
body:
71+
mappings:
72+
properties:
73+
my_dep_var: { type: "long" }
74+
75+
- do:
76+
ml.put_data_frame_analytics:
77+
id: "foo"
78+
body: >
79+
{
80+
"source": {
81+
"index": "index-with-dep-var-only"
82+
},
83+
"dest": {
84+
"index": "results"
85+
},
86+
"analysis": {
87+
"regression":{
88+
"dependent_variable": "my_dep_var"
89+
}
90+
}
91+
}
92+
93+
- do:
94+
catch: /at least one field must be included in the analysis \(excluding fields \[my_dep_var\]\)/
6195
ml.start_data_frame_analytics:
6296
id: "foo"
6397

0 commit comments

Comments
 (0)