Skip to content

Use query in cardinality check #49939

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.support.WriteRequest;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
Expand Down Expand Up @@ -228,7 +230,7 @@ public void testWithOnlyTrainingRowsAndTrainingPercentIsFifty_DependentVariableI
assertEvaluation(BOOLEAN_FIELD, BOOLEAN_FIELD_VALUES, "ml.boolean-field_prediction");
}

public void testDependentVariableCardinalityTooHighError() {
public void testDependentVariableCardinalityTooHighError() throws Exception {
initialize("cardinality_too_high");
indexData(sourceIndex, 6, 5, KEYWORD_FIELD);
// Index one more document with a class different than the two already used.
Expand All @@ -246,6 +248,27 @@ public void testDependentVariableCardinalityTooHighError() {
assertThat(e.getMessage(), equalTo("Field [keyword-field] must have at most [2] distinct values but there were at least [3]"));
}

public void testDependentVariableCardinalityTooHighButWithQueryMakesItWithinRange() throws Exception {
initialize("cardinality_too_high_with_query");
indexData(sourceIndex, 6, 5, KEYWORD_FIELD);
// Index one more document with a class different than the two already used.
client().execute(IndexAction.INSTANCE, new IndexRequest(sourceIndex)
.source(KEYWORD_FIELD, "fox")
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE))
.actionGet();
QueryBuilder query = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(KEYWORD_FIELD, KEYWORD_FIELD_VALUES));

DataFrameAnalyticsConfig config = buildAnalytics(jobId, sourceIndex, destIndex, null, new Classification(KEYWORD_FIELD), query);
registerAnalytics(config);
putAnalytics(config);

// Should not throw
startAnalytics(jobId);
waitUntilAnalyticsIsStopped(jobId);

assertProgress(jobId, 100, 100, 100, 100);
}

private void initialize(String jobId) {
this.jobId = jobId;
this.sourceIndex = jobId + "_source_index";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.xpack.core.ml.action.DeleteDataFrameAnalyticsAction;
Expand All @@ -37,6 +38,7 @@
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
import org.elasticsearch.xpack.core.ml.notifications.AuditorField;
import org.elasticsearch.xpack.core.ml.utils.PhaseProgress;
import org.elasticsearch.xpack.core.ml.utils.QueryProvider;
import org.elasticsearch.xpack.ml.dataframe.DataFrameAnalyticsTask;
import org.hamcrest.Matcher;
import org.hamcrest.Matchers;
Expand Down Expand Up @@ -161,10 +163,16 @@ protected EvaluateDataFrameAction.Response evaluateDataFrame(String index, Evalu
}

protected static DataFrameAnalyticsConfig buildAnalytics(String id, String sourceIndex, String destIndex,
@Nullable String resultsField, DataFrameAnalysis analysis) {
@Nullable String resultsField, DataFrameAnalysis analysis) throws Exception {
return buildAnalytics(id, sourceIndex, destIndex, resultsField, analysis, QueryBuilders.matchAllQuery());
}

protected static DataFrameAnalyticsConfig buildAnalytics(String id, String sourceIndex, String destIndex,
@Nullable String resultsField, DataFrameAnalysis analysis,
QueryBuilder queryBuilder) throws Exception {
return new DataFrameAnalyticsConfig.Builder()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other method called buildAnalytics (starting in line 165) should call this one, as the new method is more generic.

.setId(id)
.setSource(new DataFrameAnalyticsSource(new String[] { sourceIndex }, null, null))
.setSource(new DataFrameAnalyticsSource(new String[] { sourceIndex }, QueryProvider.fromParsedQuery(queryBuilder), null))
.setDest(new DataFrameAnalyticsDest(destIndex, resultsField))
.setAnalysis(analysis)
.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ private void getCardinalitiesForFieldsWithLimit(String[] index, DataFrameAnalyti
listener::onFailure
);

SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder().size(0);
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder().size(0).query(config.getSource().getParsedQuery());
for (Map.Entry<String, Long> entry : fieldCardinalityLimits.entrySet()) {
String fieldName = entry.getKey();
Long limit = entry.getValue();
Expand Down