elastic
diff --git a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/DelimitedLogStructureFinder.java b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/DelimitedLogStructureFinder.java
Lines changed: 8 additions & 1 deletion
diff --git a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStats.java b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStats.java
Lines changed: 147 additions & 0 deletions
diff --git a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsCalculator.java b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsCalculator.java
Lines changed: 182 additions & 0 deletions
@@ -123,9 +123,16 @@ static DelimitedLogStructureFinder makeDelimitedLogStructureFinder(List<String>
                 .setMultilineStartPattern(timeLineRegex);
         }
 
-        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
+            LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
+
+        SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
         mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
 
+        if (mappingsAndFieldStats.v2() != null) {
+            structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
+        }
+
         LogStructure structure = structureBuilder
             .setMappings(mappings)
             .setExplanation(explanation)
 
@@ -0,0 +1,147 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * Immutable holder for the statistics of a single field: the number of values, the number of
+ * distinct values (cardinality), optional numeric statistics (min, max, mean and median) and the
+ * top hits.  Instances can be parsed with {@link #PARSER} and rendered with
+ * {@link #toXContent(XContentBuilder, Params)}.
+ */
+public class FieldStats implements ToXContentObject {
+
+    static final ParseField COUNT = new ParseField("count");
+    static final ParseField CARDINALITY = new ParseField("cardinality");
+    static final ParseField MIN_VALUE = new ParseField("min_value");
+    static final ParseField MAX_VALUE = new ParseField("max_value");
+    static final ParseField MEAN_VALUE = new ParseField("mean_value");
+    static final ParseField MEDIAN_VALUE = new ParseField("median_value");
+    static final ParseField TOP_HITS = new ParseField("top_hits");
+
+    /**
+     * Parser that builds a {@link FieldStats} from its XContent form.  The positions in the
+     * argument array correspond to the order of the declarations in the static initializer below:
+     * count, cardinality, min, max, mean, median, top hits.
+     */
+    @SuppressWarnings("unchecked")
+    public static final ConstructingObjectParser<FieldStats, Void> PARSER = new ConstructingObjectParser<>("field_stats", false,
+        a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5],
+            (List<Map<String, Object>>) a[6]));
+
+    static {
+        // Count and cardinality are required; everything else is optional
+        PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT);
+        PARSER.declareInt(ConstructingObjectParser.constructorArg(), CARDINALITY);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MIN_VALUE);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE);
+        PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS);
+    }
+
+    private final long count;
+    private final int cardinality;
+    private final Double minValue;
+    private final Double maxValue;
+    private final Double meanValue;
+    private final Double medianValue;
+    private final List<Map<String, Object>> topHits;
+
+    /**
+     * Create statistics that have no numeric part: min, max, mean and median are all <code>null</code>.
+     * @param count       The number of values.
+     * @param cardinality The number of distinct values.
+     * @param topHits     The top hits.  May be <code>null</code>, which is treated as an empty list.
+     */
+    FieldStats(long count, int cardinality, List<Map<String, Object>> topHits) {
+        this(count, cardinality, null, null, null, null, topHits);
+    }
+
+    /**
+     * Create statistics with every component specified.
+     * @param count       The number of values.
+     * @param cardinality The number of distinct values.
+     * @param minValue    The minimum value, or <code>null</code> if not available.
+     * @param maxValue    The maximum value, or <code>null</code> if not available.
+     * @param meanValue   The mean value, or <code>null</code> if not available.
+     * @param medianValue The median value, or <code>null</code> if not available.
+     * @param topHits     The top hits.  May be <code>null</code>, which is treated as an empty list.
+     */
+    FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue,
+               List<Map<String, Object>> topHits) {
+        this.count = count;
+        this.cardinality = cardinality;
+        this.minValue = minValue;
+        this.maxValue = maxValue;
+        this.meanValue = meanValue;
+        this.medianValue = medianValue;
+        if (topHits == null) {
+            this.topHits = Collections.emptyList();
+        } else {
+            // Copy before wrapping: Collections.unmodifiableList() only creates a view, so without
+            // the copy a caller that kept a reference to the list could still change the state of
+            // this object (and the results of equals() and hashCode()) after construction.
+            // The copy is shallow - the maps inside the list are not copied.
+            this.topHits = Collections.unmodifiableList(new java.util.ArrayList<>(topHits));
+        }
+    }
+
+    /**
+     * @return The number of values.
+     */
+    public long getCount() {
+        return count;
+    }
+
+    /**
+     * @return The number of distinct values.
+     */
+    public int getCardinality() {
+        return cardinality;
+    }
+
+    /**
+     * @return The minimum value, or <code>null</code> if not available.
+     */
+    public Double getMinValue() {
+        return minValue;
+    }
+
+    /**
+     * @return The maximum value, or <code>null</code> if not available.
+     */
+    public Double getMaxValue() {
+        return maxValue;
+    }
+
+    /**
+     * @return The mean value, or <code>null</code> if not available.
+     */
+    public Double getMeanValue() {
+        return meanValue;
+    }
+
+    /**
+     * @return The median value, or <code>null</code> if not available.
+     */
+    public Double getMedianValue() {
+        return medianValue;
+    }
+
+    /**
+     * @return An unmodifiable list of the top hits.  Never <code>null</code>, but may be empty.
+     */
+    public List<Map<String, Object>> getTopHits() {
+        return topHits;
+    }
+
+    /**
+     * Write these statistics as an object.  Count and cardinality are always written; the numeric
+     * statistics are written only when they are not <code>null</code> and the top hits only when
+     * the list is not empty.
+     */
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+
+        builder.startObject();
+        builder.field(COUNT.getPreferredName(), count);
+        builder.field(CARDINALITY.getPreferredName(), cardinality);
+        if (minValue != null) {
+            builder.field(MIN_VALUE.getPreferredName(), minValue);
+        }
+        if (maxValue != null) {
+            builder.field(MAX_VALUE.getPreferredName(), maxValue);
+        }
+        if (meanValue != null) {
+            builder.field(MEAN_VALUE.getPreferredName(), meanValue);
+        }
+        if (medianValue != null) {
+            builder.field(MEDIAN_VALUE.getPreferredName(), medianValue);
+        }
+        if (topHits.isEmpty() == false) {
+            builder.field(TOP_HITS.getPreferredName(), topHits);
+        }
+        builder.endObject();
+
+        return builder;
+    }
+
+    @Override
+    public int hashCode() {
+
+        return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits);
+    }
+
+    /**
+     * Two instances are equal when every statistic, including the top hits, is equal.
+     */
+    @Override
+    public boolean equals(Object other) {
+
+        if (this == other) {
+            return true;
+        }
+
+        if (other == null || getClass() != other.getClass()) {
+            return false;
+        }
+
+        FieldStats that = (FieldStats) other;
+        return this.count == that.count &&
+            this.cardinality == that.cardinality &&
+            Objects.equals(this.minValue, that.minValue) &&
+            Objects.equals(this.maxValue, that.maxValue) &&
+            Objects.equals(this.meanValue, that.meanValue) &&
+            Objects.equals(this.medianValue, that.medianValue) &&
+            Objects.equals(this.topHits, that.topHits);
+    }
+}
@@ -0,0 +1,182 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+/**
+ * Accumulates scalar field values, supplied as strings, and derives summary statistics from them.
+ * The count, the cardinality (distinct count) and the top hits (most common values) are always
+ * produced.  The min, max, mean and median are produced too, provided that every value accepted
+ * could be parsed by {@link Double#valueOf(String)}.
+ */
+public class FieldStatsCalculator {
+
+    /** Total number of values accepted so far. */
+    private long count;
+
+    /** Number of occurrences of each distinct value, keyed by the raw string. */
+    private final SortedMap<String, Integer> countsByStringValue = new TreeMap<>();
+
+    /**
+     * Number of occurrences of each distinct value, keyed by the parsed number.
+     * Becomes <code>null</code> as soon as a value that cannot be parsed as a double is accepted.
+     */
+    private SortedMap<Double, Integer> countsByNumericValue = new TreeMap<>();
+
+    /**
+     * Feed more values into the calculator.
+     * Callers are free to pass all their values in one call or to spread them over several calls.
+     * @param fieldValues Zero or more values to add.  May not be <code>null</code>.
+     */
+    public void accept(Collection<String> fieldValues) {
+
+        count += fieldValues.size();
+
+        for (String value : fieldValues) {
+
+            countsByStringValue.merge(value, 1, Integer::sum);
+
+            if (countsByNumericValue == null) {
+                // A non-numeric value was seen earlier, so numeric statistics are no longer tracked
+                continue;
+            }
+
+            try {
+                countsByNumericValue.merge(Double.valueOf(value), 1, Integer::sum);
+            } catch (NumberFormatException e) {
+                // One non-numeric value is enough to rule out numeric statistics for the field
+                countsByNumericValue = null;
+            }
+        }
+    }
+
+    /**
+     * Produce the field statistics for everything accepted so far.
+     * @param numTopHits The maximum number of entries to include in the top hits.
+     * @return The calculated field statistics.
+     */
+    public FieldStats calculate(int numTopHits) {
+
+        boolean haveNumericValues = countsByNumericValue != null && countsByNumericValue.isEmpty() == false;
+        if (haveNumericValues == false) {
+            return new FieldStats(count, countsByStringValue.size(), findStringTopHits(numTopHits));
+        }
+
+        // The map is sorted, so its first and last keys are the min and max
+        Double min = countsByNumericValue.firstKey();
+        Double max = countsByNumericValue.lastKey();
+        return new FieldStats(count, countsByNumericValue.size(), min, max, calculateMean(), calculateMedian(),
+            findNumericTopHits(numTopHits));
+    }
+
+    /**
+     * @return The mean of the numeric values, or <code>null</code> if there are none.
+     */
+    Double calculateMean() {
+
+        assert countsByNumericValue != null;
+
+        if (countsByNumericValue.isEmpty()) {
+            return null;
+        }
+
+        double weightSoFar = 0.0;
+        double mean = Double.NaN;
+
+        for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
+
+            double value = entry.getKey();
+            double weight = entry.getValue().doubleValue();
+            double weightIncludingEntry = weightSoFar + weight;
+
+            // Folding each distinct value into a running mean is more numerically stable than (sum / count)
+            if (weightSoFar > 0.0) {
+                mean = mean * (weightSoFar / weightIncludingEntry) + value * (weight / weightIncludingEntry);
+            } else if (weight > 0.0) {
+                mean = value;
+            }
+
+            weightSoFar = weightIncludingEntry;
+        }
+
+        return mean;
+    }
+
+    /**
+     * @return The median of the numeric values, or <code>null</code> if the middle of the
+     *         distribution is not reached while walking the numeric counts.
+     */
+    Double calculateMedian() {
+
+        assert countsByNumericValue != null;
+
+        if (count % 2 == 1) {
+            return findMedianOfOddCount();
+        }
+        return findMedianOfEvenCount();
+    }
+
+    /**
+     * Simple case - with an odd number of values the median is the single middle value.
+     */
+    private Double findMedianOfOddCount() {
+
+        long middleRank = count / 2 + 1;
+        long cumulativeCount = 0;
+
+        for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
+
+            cumulativeCount += entry.getValue();
+
+            if (cumulativeCount >= middleRank) {
+                return entry.getKey();
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * More complicated case - with an even number of values the median is the average of the two middle values.
+     */
+    private Double findMedianOfEvenCount() {
+
+        long lowerRank = count / 2;
+        long upperRank = lowerRank + 1;
+        double lowerValue = Double.NaN;
+        long previousCumulativeCount = -1;
+        long cumulativeCount = 0;
+
+        for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
+
+            cumulativeCount += entry.getValue();
+
+            if (cumulativeCount >= upperRank) {
+
+                if (previousCumulativeCount < lowerRank) {
+                    // Both middle values fall within this entry, so they are the same
+                    return entry.getKey();
+                }
+                return (lowerValue + entry.getKey()) / 2.0;
+            }
+
+            if (cumulativeCount >= lowerRank) {
+                lowerValue = entry.getKey();
+            }
+
+            previousCumulativeCount = cumulativeCount;
+        }
+
+        return null;
+    }
+
+    /**
+     * @param numTopHits The maximum number of entries to return.
+     * @return The most common numeric values, ties broken by ascending numeric value.
+     */
+    List<Map<String, Object>> findNumericTopHits(int numTopHits) {
+        assert countsByNumericValue != null;
+        return findTopHits(numTopHits, countsByNumericValue, Map.Entry.<Double, Integer>comparingByKey());
+    }
+
+    /**
+     * @param numTopHits The maximum number of entries to return.
+     * @return The most common string values, ties broken by ascending string value.
+     */
+    List<Map<String, Object>> findStringTopHits(int numTopHits) {
+        return findTopHits(numTopHits, countsByStringValue, Map.Entry.<String, Integer>comparingByKey());
+    }
+
+    /**
+     * Order by descending count, with a secondary sort to ensure reproducibility of results.
+     */
+    private static <T> List<Map<String, Object>> findTopHits(int numTopHits, Map<T, Integer> countsByValue,
+                                                             Comparator<Map.Entry<T, Integer>> secondarySort) {
+
+        Comparator<Map.Entry<T, Integer>> mostCommonFirst = Map.Entry.<T, Integer>comparingByValue(Comparator.reverseOrder());
+
+        return countsByValue.entrySet().stream()
+            .sorted(mostCommonFirst.thenComparing(secondarySort))
+            .limit(numTopHits)
+            .map(FieldStatsCalculator::toTopHit)
+            .collect(Collectors.toCollection(ArrayList::new));
+    }
+
+    /**
+     * Convert one value/count pair into the map representation used for a top hit.
+     */
+    private static <T> Map<String, Object> toTopHit(Map.Entry<T, Integer> entry) {
+
+        Map<String, Object> topHit = new LinkedHashMap<>(3);
+        topHit.put("value", entry.getKey());
+        topHit.put("count", entry.getValue());
+        return topHit;
+    }
+}