Skip to content

Commit 338aceb

Browse files
committed
[ML] Add field stats to log structure finder (#33351)
The log structure endpoint will return these in addition to pure structure information so that it can be used to drive pre-import data visualizer functionality. The statistics for every field are count, cardinality (distinct count) and top hits (most common values). Extra statistics are calculated if the field is numeric: min, max, mean and median.
1 parent 68b6466 commit 338aceb

File tree

14 files changed

+815
-85
lines changed

14 files changed

+815
-85
lines changed

x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/DelimitedLogStructureFinder.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,16 @@ static DelimitedLogStructureFinder makeDelimitedLogStructureFinder(List<String>
123123
.setMultilineStartPattern(timeLineRegex);
124124
}
125125

126-
SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
126+
Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
127+
LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
128+
129+
SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
127130
mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
128131

132+
if (mappingsAndFieldStats.v2() != null) {
133+
structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
134+
}
135+
129136
LogStructure structure = structureBuilder
130137
.setMappings(mappings)
131138
.setExplanation(explanation)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.ml.logstructurefinder;
7+
8+
import org.elasticsearch.common.ParseField;
9+
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
10+
import org.elasticsearch.common.xcontent.ToXContentObject;
11+
import org.elasticsearch.common.xcontent.XContentBuilder;
12+
13+
import java.io.IOException;
14+
import java.util.Collections;
15+
import java.util.List;
16+
import java.util.Map;
17+
import java.util.Objects;
18+
19+
public class FieldStats implements ToXContentObject {
20+
21+
static final ParseField COUNT = new ParseField("count");
22+
static final ParseField CARDINALITY = new ParseField("cardinality");
23+
static final ParseField MIN_VALUE = new ParseField("min_value");
24+
static final ParseField MAX_VALUE = new ParseField("max_value");
25+
static final ParseField MEAN_VALUE = new ParseField("mean_value");
26+
static final ParseField MEDIAN_VALUE = new ParseField("median_value");
27+
static final ParseField TOP_HITS = new ParseField("top_hits");
28+
29+
@SuppressWarnings("unchecked")
30+
public static final ConstructingObjectParser<FieldStats, Void> PARSER = new ConstructingObjectParser<>("field_stats", false,
31+
a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5],
32+
(List<Map<String, Object>>) a[6]));
33+
34+
static {
35+
PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT);
36+
PARSER.declareInt(ConstructingObjectParser.constructorArg(), CARDINALITY);
37+
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MIN_VALUE);
38+
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE);
39+
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE);
40+
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE);
41+
PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS);
42+
}
43+
44+
private final long count;
45+
private final int cardinality;
46+
private final Double minValue;
47+
private final Double maxValue;
48+
private final Double meanValue;
49+
private final Double medianValue;
50+
private final List<Map<String, Object>> topHits;
51+
52+
FieldStats(long count, int cardinality, List<Map<String, Object>> topHits) {
53+
this(count, cardinality, null, null, null, null, topHits);
54+
}
55+
56+
FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue,
57+
List<Map<String, Object>> topHits) {
58+
this.count = count;
59+
this.cardinality = cardinality;
60+
this.minValue = minValue;
61+
this.maxValue = maxValue;
62+
this.meanValue = meanValue;
63+
this.medianValue = medianValue;
64+
this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits);
65+
}
66+
67+
public long getCount() {
68+
return count;
69+
}
70+
71+
public int getCardinality() {
72+
return cardinality;
73+
}
74+
75+
public Double getMinValue() {
76+
return minValue;
77+
}
78+
79+
public Double getMaxValue() {
80+
return maxValue;
81+
}
82+
83+
public Double getMeanValue() {
84+
return meanValue;
85+
}
86+
87+
public Double getMedianValue() {
88+
return medianValue;
89+
}
90+
91+
public List<Map<String, Object>> getTopHits() {
92+
return topHits;
93+
}
94+
95+
@Override
96+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
97+
98+
builder.startObject();
99+
builder.field(COUNT.getPreferredName(), count);
100+
builder.field(CARDINALITY.getPreferredName(), cardinality);
101+
if (minValue != null) {
102+
builder.field(MIN_VALUE.getPreferredName(), minValue);
103+
}
104+
if (maxValue != null) {
105+
builder.field(MAX_VALUE.getPreferredName(), maxValue);
106+
}
107+
if (meanValue != null) {
108+
builder.field(MEAN_VALUE.getPreferredName(), meanValue);
109+
}
110+
if (medianValue != null) {
111+
builder.field(MEDIAN_VALUE.getPreferredName(), medianValue);
112+
}
113+
if (topHits.isEmpty() == false) {
114+
builder.field(TOP_HITS.getPreferredName(), topHits);
115+
}
116+
builder.endObject();
117+
118+
return builder;
119+
}
120+
121+
@Override
122+
public int hashCode() {
123+
124+
return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits);
125+
}
126+
127+
@Override
128+
public boolean equals(Object other) {
129+
130+
if (this == other) {
131+
return true;
132+
}
133+
134+
if (other == null || getClass() != other.getClass()) {
135+
return false;
136+
}
137+
138+
FieldStats that = (FieldStats) other;
139+
return this.count == that.count &&
140+
this.cardinality == that.cardinality &&
141+
Objects.equals(this.minValue, that.minValue) &&
142+
Objects.equals(this.maxValue, that.maxValue) &&
143+
Objects.equals(this.meanValue, that.meanValue) &&
144+
Objects.equals(this.medianValue, that.medianValue) &&
145+
Objects.equals(this.topHits, that.topHits);
146+
}
147+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.ml.logstructurefinder;
7+
8+
import java.util.ArrayList;
9+
import java.util.Collection;
10+
import java.util.Comparator;
11+
import java.util.LinkedHashMap;
12+
import java.util.List;
13+
import java.util.Map;
14+
import java.util.SortedMap;
15+
import java.util.TreeMap;
16+
import java.util.stream.Collectors;
17+
18+
/**
19+
* Calculate statistics for a set of scalar field values.
20+
* Count, cardinality (distinct count) and top hits (most common values) are always calculated.
21+
* Extra statistics are calculated if the field is numeric: min, max, mean and median.
22+
*/
23+
public class FieldStatsCalculator {
24+
25+
private long count;
26+
private SortedMap<String, Integer> countsByStringValue = new TreeMap<>();
27+
private SortedMap<Double, Integer> countsByNumericValue = new TreeMap<>();
28+
29+
/**
30+
* Add a collection of values to the calculator.
31+
* The values to be added can be combined by the caller and added in a
32+
* single call to this method or added in multiple calls to this method.
33+
* @param fieldValues Zero or more values to add. May not be <code>null</code>.
34+
*/
35+
public void accept(Collection<String> fieldValues) {
36+
37+
count += fieldValues.size();
38+
39+
for (String fieldValue : fieldValues) {
40+
41+
countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 1 : (1 + v));
42+
43+
if (countsByNumericValue != null) {
44+
45+
try {
46+
countsByNumericValue.compute(Double.valueOf(fieldValue), (k, v) -> (v == null) ? 1 : (1 + v));
47+
} catch (NumberFormatException e) {
48+
countsByNumericValue = null;
49+
}
50+
}
51+
}
52+
}
53+
54+
/**
55+
* Calculate field statistics based on the previously accepted values.
56+
* @param numTopHits The maximum number of entries to include in the top hits.
57+
* @return The calculated field statistics.
58+
*/
59+
public FieldStats calculate(int numTopHits) {
60+
61+
if (countsByNumericValue != null && countsByNumericValue.isEmpty() == false) {
62+
return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(),
63+
calculateMean(), calculateMedian(), findNumericTopHits(numTopHits));
64+
} else {
65+
return new FieldStats(count, countsByStringValue.size(), findStringTopHits(numTopHits));
66+
}
67+
}
68+
69+
Double calculateMean() {
70+
71+
assert countsByNumericValue != null;
72+
73+
if (countsByNumericValue.isEmpty()) {
74+
return null;
75+
}
76+
77+
double runningCount = 0.0;
78+
double runningMean = Double.NaN;
79+
80+
for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
81+
82+
double entryCount = (double) entry.getValue();
83+
double newRunningCount = runningCount + entryCount;
84+
85+
// Updating a running mean like this is more numerically stable than using (sum / count)
86+
if (runningCount > 0.0) {
87+
runningMean = runningMean * (runningCount / newRunningCount) + entry.getKey() * (entryCount / newRunningCount);
88+
} else if (entryCount > 0.0) {
89+
runningMean = entry.getKey();
90+
}
91+
92+
runningCount = newRunningCount;
93+
}
94+
95+
return runningMean;
96+
}
97+
98+
Double calculateMedian() {
99+
100+
assert countsByNumericValue != null;
101+
102+
if (count % 2 == 1) {
103+
104+
// Simple case - median is middle value
105+
long targetCount = count / 2 + 1;
106+
long currentUpperBound = 0;
107+
108+
for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
109+
110+
currentUpperBound += entry.getValue();
111+
112+
if (currentUpperBound >= targetCount) {
113+
return entry.getKey();
114+
}
115+
}
116+
117+
} else {
118+
119+
// More complicated case - median is average of two middle values
120+
long target1Count = count / 2;
121+
long target2Count = target1Count + 1;
122+
double target1Value = Double.NaN;
123+
long prevUpperBound = -1;
124+
long currentUpperBound = 0;
125+
126+
for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
127+
128+
currentUpperBound += entry.getValue();
129+
130+
if (currentUpperBound >= target2Count) {
131+
132+
if (prevUpperBound < target1Count) {
133+
// Both target values are the same
134+
return entry.getKey();
135+
} else {
136+
return (target1Value + entry.getKey()) / 2.0;
137+
}
138+
}
139+
140+
if (currentUpperBound >= target1Count) {
141+
target1Value = entry.getKey();
142+
}
143+
144+
prevUpperBound = currentUpperBound;
145+
}
146+
}
147+
148+
return null;
149+
}
150+
151+
List<Map<String, Object>> findNumericTopHits(int numTopHits) {
152+
assert countsByNumericValue != null;
153+
return findTopHits(numTopHits, countsByNumericValue, Comparator.comparing(Map.Entry<Double, Integer>::getKey));
154+
}
155+
156+
List<Map<String, Object>> findStringTopHits(int numTopHits) {
157+
return findTopHits(numTopHits, countsByStringValue, Comparator.comparing(Map.Entry<String, Integer>::getKey));
158+
}
159+
160+
/**
161+
* Order by descending count, with a secondary sort to ensure reproducibility of results.
162+
*/
163+
private static <T> List<Map<String, Object>> findTopHits(int numTopHits, Map<T, Integer> countsByValue,
164+
Comparator<Map.Entry<T, Integer>> secondarySort) {
165+
166+
List<Map.Entry<T, Integer>> sortedByCount = countsByValue.entrySet().stream()
167+
.sorted(Comparator.comparing(Map.Entry<T, Integer>::getValue, Comparator.reverseOrder()).thenComparing(secondarySort))
168+
.limit(numTopHits).collect(Collectors.toList());
169+
170+
List<Map<String, Object>> topHits = new ArrayList<>(sortedByCount.size());
171+
172+
for (Map.Entry<T, Integer> entry : sortedByCount) {
173+
174+
Map<String, Object> topHit = new LinkedHashMap<>(3);
175+
topHit.put("value", entry.getKey());
176+
topHit.put("count", entry.getValue());
177+
topHits.add(topHit);
178+
}
179+
180+
return topHits;
181+
}
182+
}

0 commit comments

Comments
 (0)