Skip to content

Commit

Permalink
HLRC support for string_stats (#52163) (#52297)
Browse files Browse the repository at this point in the history
This adds a builder and parsed results for the `string_stats`
aggregation directly to the high level rest client. Without this, the
HLRC can only access the `string_stats` API by depending on the
Elastic-licensed `analytics` module.

While I'm in there, this also adds a few of our usual unit tests and
modernizes the parsing.
  • Loading branch information
nik9000 authored Feb 13, 2020
1 parent 12e378b commit 2dac36d
Show file tree
Hide file tree
Showing 17 changed files with 727 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.action.update.UpdateResponse;
import org.elasticsearch.client.analytics.ParsedStringStats;
import org.elasticsearch.client.analytics.StringStatsAggregationBuilder;
import org.elasticsearch.client.core.CountRequest;
import org.elasticsearch.client.core.CountResponse;
import org.elasticsearch.client.core.GetSourceRequest;
Expand Down Expand Up @@ -1926,6 +1928,7 @@ static List<NamedXContentRegistry.Entry> getDefaultNamedXContents() {
map.put(IpRangeAggregationBuilder.NAME, (p, c) -> ParsedBinaryRange.fromXContent(p, (String) c));
map.put(TopHitsAggregationBuilder.NAME, (p, c) -> ParsedTopHits.fromXContent(p, (String) c));
map.put(CompositeAggregationBuilder.NAME, (p, c) -> ParsedComposite.fromXContent(p, (String) c));
map.put(StringStatsAggregationBuilder.NAME, (p, c) -> ParsedStringStats.PARSER.parse(p, (String) c));
List<NamedXContentRegistry.Entry> entries = map.entrySet().stream()
.map(entry -> new NamedXContentRegistry.Entry(Aggregation.class, new ParseField(entry.getKey()), entry.getValue()))
.collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.client.analytics;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.ParsedAggregation;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import static java.util.Collections.unmodifiableMap;
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg;

/**
 * Client side view of the response of the {@code string_stats} aggregation.
 * Instances are only created by {@link #PARSER}.
 */
public class ParsedStringStats extends ParsedAggregation {
    private static final ParseField COUNT_FIELD = new ParseField("count");
    private static final ParseField MIN_LENGTH_FIELD = new ParseField("min_length");
    private static final ParseField MAX_LENGTH_FIELD = new ParseField("max_length");
    private static final ParseField AVG_LENGTH_FIELD = new ParseField("avg_length");
    private static final ParseField ENTROPY_FIELD = new ParseField("entropy");
    private static final ParseField DISTRIBUTION_FIELD = new ParseField("distribution");

    private final long count;
    private final int minLength;
    private final int maxLength;
    private final double avgLength;
    private final double entropy;
    /** Whether {@code distribution} is written back out by {@link #doXContentBody}. */
    private final boolean showDistribution;
    private final Map<String, Double> distribution;

    private ParsedStringStats(String name, long count, int minLength, int maxLength, double avgLength, double entropy,
            boolean showDistribution, Map<String, Double> distribution) {
        setName(name);
        this.count = count;
        this.minLength = minLength;
        this.maxLength = maxLength;
        this.avgLength = avgLength;
        this.entropy = entropy;
        this.showDistribution = showDistribution;
        this.distribution = distribution;
    }

    /**
     * The number of non-empty fields counted.
     */
    public long getCount() {
        return count;
    }

    /**
     * The length of the shortest term.
     */
    public int getMinLength() {
        return minLength;
    }

    /**
     * The length of the longest term.
     */
    public int getMaxLength() {
        return maxLength;
    }

    /**
     * The average length computed over all terms.
     */
    public double getAvgLength() {
        return avgLength;
    }

    /**
     * The <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">Shannon Entropy</a>
     * value computed over all terms collected by the aggregation.
     * Shannon entropy quantifies the amount of information contained in
     * the field. It is a very useful metric for measuring a wide range of
     * properties of a data set, such as diversity, similarity,
     * randomness etc.
     */
    public double getEntropy() {
        return entropy;
    }

    /**
     * The probability distribution for all characters. {@code null} unless
     * explicitly requested with {@link StringStatsAggregationBuilder#showDistribution(boolean)}.
     */
    public Map<String, Double> getDistribution() {
        return distribution;
    }

    @Override
    public String getType() {
        return StringStatsAggregationBuilder.NAME;
    }

    /** Stand-in the parser supplies when the {@code distribution} field is present but {@code null}. */
    private static final Object NULL_DISTRIBUTION_MARKER = new Object();
    public static final ConstructingObjectParser<ParsedStringStats, String> PARSER = new ConstructingObjectParser<>(
            StringStatsAggregationBuilder.NAME, true, (args, name) -> fromParsedArgs(name, args));
    static {
        // The declaration order below fixes the indices used in fromParsedArgs.
        PARSER.declareLong(constructorArg(), COUNT_FIELD);
        PARSER.declareIntOrNull(constructorArg(), 0, MIN_LENGTH_FIELD);
        PARSER.declareIntOrNull(constructorArg(), 0, MAX_LENGTH_FIELD);
        PARSER.declareDoubleOrNull(constructorArg(), 0, AVG_LENGTH_FIELD);
        PARSER.declareDoubleOrNull(constructorArg(), 0, ENTROPY_FIELD);
        PARSER.declareObjectOrNull(optionalConstructorArg(), (p, c) -> unmodifiableMap(p.map(HashMap::new, XContentParser::doubleValue)),
                NULL_DISTRIBUTION_MARKER, DISTRIBUTION_FIELD);
        ParsedAggregation.declareAggregationFields(PARSER);
    }

    /**
     * Turns the positional constructor arguments collected by {@link #PARSER}
     * into a {@link ParsedStringStats}.
     *
     * @param name the aggregation name handed to the parser as its context
     * @param args values in declaration order: count, min_length, max_length,
     *             avg_length, entropy, distribution
     */
    private static ParsedStringStats fromParsedArgs(String name, Object[] args) {
        final long parsedCount = (long) args[0];
        final boolean explicitNullDistribution = args[5] == NULL_DISTRIBUTION_MARKER;
        if (parsedCount == 0) {
            // With no values every statistic is zeroed and no distribution is kept;
            // only an explicit null distribution is remembered for rendering.
            return new ParsedStringStats(name, parsedCount, 0, 0, 0, 0, explicitNullDistribution, null);
        }
        final int shortest = (int) args[1];
        final int longest = (int) args[2];
        final double average = (double) args[3];
        final double shannonEntropy = (double) args[4];
        @SuppressWarnings("unchecked")
        final Map<String, Double> parsedDistribution = explicitNullDistribution ? null : (Map<String, Double>) args[5];
        // Render the distribution again if it was an explicit null or an actual map.
        final boolean render = explicitNullDistribution || parsedDistribution != null;
        return new ParsedStringStats(name, parsedCount, shortest, longest, average, shannonEntropy, render, parsedDistribution);
    }

    @Override
    protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
        builder.field(COUNT_FIELD.getPreferredName(), count);
        if (count != 0) {
            builder.field(MIN_LENGTH_FIELD.getPreferredName(), minLength);
            builder.field(MAX_LENGTH_FIELD.getPreferredName(), maxLength);
            builder.field(AVG_LENGTH_FIELD.getPreferredName(), avgLength);
            builder.field(ENTROPY_FIELD.getPreferredName(), entropy);
        } else {
            // Nothing was counted: lengths are rendered as null and entropy as zero.
            builder.nullField(MIN_LENGTH_FIELD.getPreferredName());
            builder.nullField(MAX_LENGTH_FIELD.getPreferredName());
            builder.nullField(AVG_LENGTH_FIELD.getPreferredName());
            builder.field(ENTROPY_FIELD.getPreferredName(), 0.0);
        }
        if (showDistribution) {
            builder.field(DISTRIBUTION_FIELD.getPreferredName(), distribution);
        }
        return builder;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.client.analytics;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryRewriteContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.aggregations.AbstractAggregationBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregatorFactories.Builder;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.aggregations.support.ValueType;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSource.Bytes;
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregationBuilder;
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.io.IOException;
import java.util.Map;
import java.util.Objects;

/**
 * Request builder for the {@code string_stats} aggregation.
 * <p>
 * NOTE: The only reason this extends {@linkplain AbstractAggregationBuilder}
 * is so that it can be passed to
 * {@link SearchSourceBuilder#aggregation(AggregationBuilder)}. None of the
 * "server" side operations are supported, for example
 * {@linkplain Writeable#writeTo(StreamOutput)},
 * {@linkplain AggregationBuilder#rewrite(QueryRewriteContext)}, or
 * {@linkplain AbstractAggregationBuilder#build(QueryShardContext, AggregatorFactory)}.
 */
public class StringStatsAggregationBuilder extends ValuesSourceAggregationBuilder<ValuesSource.Bytes, StringStatsAggregationBuilder> {
    public static final String NAME = "string_stats";
    private static final ParseField SHOW_DISTRIBUTION_FIELD = new ParseField("show_distribution");

    /** Whether the per character distribution is requested; off until enabled. */
    private boolean showDistribution;

    public StringStatsAggregationBuilder(String name) {
        super(name, CoreValuesSourceType.BYTES, ValueType.STRING);
    }

    /**
     * Compute the distribution of each character. Disabled by default.
     * @return this for chaining
     */
    public StringStatsAggregationBuilder showDistribution(boolean showDistribution) {
        this.showDistribution = showDistribution;
        return this;
    }

    @Override
    public String getType() {
        return NAME;
    }

    /**
     * Writes the {@code show_distribution} flag into the request body.
     */
    @Override
    public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
        final String fieldName = SHOW_DISTRIBUTION_FIELD.getPreferredName();
        return builder.field(fieldName, showDistribution);
    }

    /** Not supported by this client side builder. */
    @Override
    protected void innerWriteTo(StreamOutput out) throws IOException {
        throw new UnsupportedOperationException();
    }

    /** Not supported by this client side builder. */
    @Override
    protected ValuesSourceAggregatorFactory<Bytes> innerBuild(QueryShardContext queryShardContext, ValuesSourceConfig<Bytes> config,
            AggregatorFactory parent, Builder subFactoriesBuilder) throws IOException {
        throw new UnsupportedOperationException();
    }

    /** Not supported by this client side builder. */
    @Override
    protected AggregationBuilder shallowCopy(Builder factoriesBuilder, Map<String, Object> metaData) {
        throw new UnsupportedOperationException();
    }

    @Override
    public int hashCode() {
        return Objects.hash(super.hashCode(), showDistribution);
    }

    @Override
    public boolean equals(Object obj) {
        // Same concrete class, equal according to the superclass, and same flag.
        if (obj == null || obj.getClass() != getClass() || !super.equals(obj)) {
            return false;
        }
        return ((StringStatsAggregationBuilder) obj).showDistribution == showDistribution;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package org.elasticsearch.client;

import com.fasterxml.jackson.core.JsonParseException;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
Expand Down Expand Up @@ -675,6 +676,7 @@ public void testDefaultNamedXContents() {
List<NamedXContentRegistry.Entry> namedXContents = RestHighLevelClient.getDefaultNamedXContents();
int expectedInternalAggregations = InternalAggregationTestCase.getDefaultNamedXContents().size();
int expectedSuggestions = 3;
assertTrue(namedXContents.removeIf(e -> e.name.getPreferredName().equals("string_stats")));
assertEquals(expectedInternalAggregations + expectedSuggestions, namedXContents.size());
Map<Class<?>, Integer> categories = new HashMap<>();
for (NamedXContentRegistry.Entry namedXContent : namedXContents) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.client.analytics;

import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.support.WriteRequest.RefreshPolicy;
import org.elasticsearch.client.ESRestHighLevelClientTestCase;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.common.xcontent.XContentType;

import java.io.IOException;

import static org.hamcrest.Matchers.aMapWithSize;
import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasEntry;

public class AnalyticsAggsIT extends ESRestHighLevelClientTestCase {
    /**
     * Indexes two messages, runs {@code string_stats} with the distribution
     * enabled against {@code message.keyword}, and checks the parsed result.
     */
    public void testBasic() throws IOException {
        indexMessages();
        ParsedStringStats stats = searchStringStats().getAggregations().get("test");
        assertThat(stats.getCount(), equalTo(2L));
        assertThat(stats.getMinLength(), equalTo(10));
        assertThat(stats.getMaxLength(), equalTo(24));
        assertThat(stats.getAvgLength(), equalTo(17.0));
        assertThat(stats.getEntropy(), closeTo(4, .1));
        assertThat(stats.getDistribution(), aMapWithSize(18));
        assertThat(stats.getDistribution(), hasEntry(equalTo("o"), closeTo(.09, .005)));
        assertThat(stats.getDistribution(), hasEntry(equalTo("r"), closeTo(.12, .005)));
        assertThat(stats.getDistribution(), hasEntry(equalTo("t"), closeTo(.09, .005)));
    }

    /** Bulk indexes the two documents into {@code test}, refreshing immediately. */
    private void indexMessages() throws IOException {
        BulkRequest request = new BulkRequest("test").setRefreshPolicy(RefreshPolicy.IMMEDIATE);
        for (String message : new String[] { "trying out elasticsearch", "more words" }) {
            request.add(new IndexRequest().source(XContentType.JSON, "message", message));
        }
        highLevelClient().bulk(request, RequestOptions.DEFAULT);
    }

    /** Searches {@code test} with a {@code string_stats} aggregation named {@code test}. */
    private SearchResponse searchStringStats() throws IOException {
        StringStatsAggregationBuilder stringStats = new StringStatsAggregationBuilder("test")
                .field("message.keyword")
                .showDistribution(true);
        SearchRequest request = new SearchRequest("test");
        request.source().aggregation(stringStats);
        return highLevelClient().search(request, RequestOptions.DEFAULT);
    }
}
1 change: 1 addition & 0 deletions docs/java-rest/high-level/aggs-builders.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ This page lists all the available aggregations with their corresponding `Aggrega
| {ref}/search-aggregations-metrics-sum-aggregation.html[Sum] | {agg-ref}/metrics/sum/SumAggregationBuilder.html[SumAggregationBuilder] | {agg-ref}/AggregationBuilders.html#sum-java.lang.String-[AggregationBuilders.sum()]
| {ref}/search-aggregations-metrics-top-hits-aggregation.html[Top hits] | {agg-ref}/metrics/tophits/TopHitsAggregationBuilder.html[TopHitsAggregationBuilder] | {agg-ref}/AggregationBuilders.html#topHits-java.lang.String-[AggregationBuilders.topHits()]
| {ref}/search-aggregations-metrics-valuecount-aggregation.html[Value Count] | {agg-ref}/metrics/valuecount/ValueCountAggregationBuilder.html[ValueCountAggregationBuilder] | {agg-ref}/AggregationBuilders.html#count-java.lang.String-[AggregationBuilders.count()]
| {ref}/search-aggregations-metrics-string-stats-aggregation.html[String Stats] | {javadoc-client}/analytics/StringStatsAggregationBuilder.html[StringStatsAggregationBuilder] | None
|======

==== Bucket Aggregations
Expand Down
Loading

0 comments on commit 2dac36d

Please sign in to comment.