Skip to content

[7.x] Add supports for upper and lower values on boxplot based on the IQR value (#63617) #64611

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,21 @@ The response will look like this:
"max": 990.0,
"q1": 165.0,
"q2": 445.0,
"q3": 725.0
"q3": 725.0,
"lower": 0.0,
"upper": 990.0
}
}
}
--------------------------------------------------
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]

In this case, the lower and upper whisker values are equal to the min and max. In general, these values are the 1.5 *
IQR range, which is to say the nearest values to `q1 - (1.5 * IQR)` and `q3 + (1.5 * IQR)`. Since this is an approximation, the given values
may not actually be observed values from the data, but should be within a reasonable error bound of them. While the Boxplot aggregation
doesn't directly return outlier points, you can check if `lower > min` or `upper < max` to see if outliers exist on either side, and then
query for them directly.

==== Script

The boxplot metric supports scripting. For example, if our load times
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

package org.elasticsearch.xpack.analytics.boxplot;

import com.tdunning.math.stats.Centroid;

import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
Expand All @@ -24,9 +26,91 @@

public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValue implements Boxplot {

/**
* This value is used in determining the width of the whiskers of the boxplot. After the IQR value is calculated, it gets multiplied
* by this multiplier to decide how far out from the q1 and q3 points to extend the whiskers. The value of 1.5 is traditional.
* See https://en.wikipedia.org/wiki/Box_plot
*/
public static final double IQR_MULTIPLIER = 1.5;

enum Metrics {
MIN {
@Override
double value(InternalBoxplot boxplot) {
return boxplot.getMin();
}

@Override
double value(TDigestState state) {
return state == null ? Double.NEGATIVE_INFINITY : state.getMin();
}
},
MAX {
@Override
double value(InternalBoxplot boxplot) {
return boxplot.getMax();
}

@Override
double value(TDigestState state) {
return state == null ? Double.POSITIVE_INFINITY : state.getMax();
}
},
Q1 {
@Override
double value(InternalBoxplot boxplot) {
return boxplot.getQ1();
}

@Override
double value(TDigestState state) {
return state == null ? Double.NaN : state.quantile(0.25);
}
},
Q2 {
@Override
double value(InternalBoxplot boxplot) {
return boxplot.getQ2();
}

@Override
double value(TDigestState state) {
return state == null ? Double.NaN : state.quantile(0.5);
}
},
Q3 {
@Override
double value(InternalBoxplot boxplot) {
return boxplot.getQ3();
}

@Override
double value(TDigestState state) {
return state == null ? Double.NaN : state.quantile(0.75);
}
},
LOWER {
@Override
double value(InternalBoxplot boxplot) {
return whiskers(boxplot.state)[0];
}

MIN, MAX, Q1, Q2, Q3;
@Override
double value(TDigestState state) {
return whiskers(state)[0];
}
},
UPPER {
@Override
double value(InternalBoxplot boxplot) {
return whiskers(boxplot.state)[1];
}

@Override
double value(TDigestState state) {
return whiskers(state)[1];
}
};

public static Metrics resolve(String name) {
return Metrics.valueOf(name.toUpperCase(Locale.ROOT));
Expand All @@ -36,39 +120,48 @@ public String value() {
return name().toLowerCase(Locale.ROOT);
}

double value(InternalBoxplot boxplot) {
switch (this) {
case MIN:
return boxplot.getMin();
case MAX:
return boxplot.getMax();
case Q1:
return boxplot.getQ1();
case Q2:
return boxplot.getQ2();
case Q3:
return boxplot.getQ3();
default:
throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation");
}
abstract double value(InternalBoxplot boxplot);

abstract double value(TDigestState state);
}

/**
* For a given TDigest, find the "whisker" valeus, such that the upper whisker is (close to) the highest observed value less than
* q3 + 1.5 * IQR and the lower whisker is (close to) the lowest observed value greater than q1 - 1.5 * IQR. Since we don't track
* observed values directly, this function returns the centroid according to the above logic.
*
* @param state - an initialized TDigestState representing the observed data.
* @return - two doubles in an array, where whiskers[0] is the lower whisker and whiskers[1] is the upper whisker.
*/
public static double[] whiskers(TDigestState state) {
double[] results = new double[2];
results[0] = Double.NaN;
results[1] = Double.NaN;
if (state == null) {
return results;
}

double value(TDigestState state) {
switch (this) {
case MIN:
return state == null ? Double.NEGATIVE_INFINITY : state.getMin();
case MAX:
return state == null ? Double.POSITIVE_INFINITY : state.getMax();
case Q1:
return state == null ? Double.NaN : state.quantile(0.25);
case Q2:
return state == null ? Double.NaN : state.quantile(0.5);
case Q3:
return state == null ? Double.NaN : state.quantile(0.75);
default:
throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation");
double q3 = state.quantile(0.75);
double q1 = state.quantile(0.25);
double iqr = q3 - q1;
double upper = q3 + (IQR_MULTIPLIER * iqr);
double lower = q1 - (IQR_MULTIPLIER * iqr);
Centroid prev = null;
// Does this iterate in ascending order? if not, we might need to sort...
for (Centroid c : state.centroids()) {
if (Double.isNaN(results[0]) && c.mean() > lower) {
results[0] = c.mean();
}
if (c.mean() > upper) {
results[1] = prev.mean();
break;
}
prev = c;
}
if (Double.isNaN(results[1])) {
results[1] = state.getMax();
}
return results;
}

public static List<String> metricNames = Stream.of(Metrics.values())
Expand Down Expand Up @@ -188,17 +281,22 @@ public InternalBoxplot reduce(List<InternalAggregation> aggregations, ReduceCont

@Override
public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
double[] whiskers = whiskers(state);
builder.field("min", getMin());
builder.field("max", getMax());
builder.field("q1", getQ1());
builder.field("q2", getQ2());
builder.field("q3", getQ3());
builder.field("lower", whiskers[0]);
builder.field("upper", whiskers[1]);
if (format != DocValueFormat.RAW) {
builder.field("min_as_string", format.format(getMin()));
builder.field("max_as_string", format.format(getMax()));
builder.field("q1_as_string", format.format(getQ1()));
builder.field("q2_as_string", format.format(getQ2()));
builder.field("q3_as_string", format.format(getQ3()));
builder.field("lower_as_string", format.format(whiskers[0]));
builder.field("upper_as_string", format.format(whiskers[1]));
}
return builder;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,35 @@ protected List<NamedXContentRegistry.Entry> getNamedXContents() {
return extendedNamedXContents;
}

public void testIQR() {
double epsilon = 0.00001; // tolerance on equality for doubles
TDigestState state = new TDigestState(100);
for (double value : org.elasticsearch.common.collect.List.of(52, 57, 57, 58, 63, 66, 66, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75,
75, 76, 76, 78, 79, 89)) {
state.add(value);
}
double[] actual = InternalBoxplot.whiskers(state);
assertEquals(57.0, actual[0], epsilon);
assertEquals(79.0, actual[1], epsilon);

// Test null state
actual = InternalBoxplot.whiskers(null);
assertNotNull(actual);
assertTrue(Double.isNaN(actual[0]));
assertTrue(Double.isNaN(actual[1]));
}

public void testIterator() {
InternalBoxplot aggregation = createTestInstance("test", emptyMap());
List<String> names = StreamSupport.stream(aggregation.valueNames().spliterator(), false).collect(Collectors.toList());

assertEquals(5, names.size());
assertEquals(7, names.size());
assertTrue(names.contains("min"));
assertTrue(names.contains("max"));
assertTrue(names.contains("q1"));
assertTrue(names.contains("q2"));
assertTrue(names.contains("q3"));
assertTrue(names.contains("lower"));
assertTrue(names.contains("upper"));
}
}