Skip to content

[ML] Add support for date_nanos fields in find_file_structure #62048

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,15 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
.setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats())
.setNeedClientTimezone(needClientTimeZone)
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone))
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
timeField.v2().needNanosecondPrecision()))
.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage, delimiterPattern,
quotePattern, mappings, timeField.v1(), timeField.v2()));

mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
} else {
structureBuilder.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(),
csvProcessorSettings, mappings, null, null, false));
csvProcessorSettings, mappings, null, null, false, false));
structureBuilder.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage,
delimiterPattern, quotePattern, mappings, null, null));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public final class FileStructureUtils {
public static final String MAPPING_PROPERTIES_SETTING = "properties";
public static final Map<String, String> DATE_MAPPING_WITHOUT_FORMAT =
Collections.singletonMap(MAPPING_TYPE_SETTING, "date");
public static final String NANOSECOND_DATE_OUTPUT_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX";
public static final Set<String> CONVERTIBLE_TYPES =
Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));

Expand Down Expand Up @@ -397,13 +398,15 @@ static boolean isMoreLikelyTextThanKeyword(String str) {
* @param timestampFormats Timestamp formats to be used for parsing {@code timestampField}.
* May be <code>null</code> if {@code timestampField} is also <code>null</code>.
* @param needClientTimezone Is the timezone of the client supplying data to ingest required to uniquely parse the timestamp?
* @param needNanosecondPrecision Does the timestamp have more than millisecond accuracy?
* @return The ingest pipeline definition, or <code>null</code> if none is required.
*/
public static Map<String, Object> makeIngestPipelineDefinition(String grokPattern, Map<String, String> customGrokPatternDefinitions,
Map<String, Object> csvProcessorSettings,
Map<String, Object> mappingsForConversions,
String timestampField, List<String> timestampFormats,
boolean needClientTimezone) {
boolean needClientTimezone,
boolean needNanosecondPrecision) {

if (grokPattern == null && csvProcessorSettings == null && timestampField == null) {
return null;
Expand Down Expand Up @@ -437,6 +440,9 @@ public static Map<String, Object> makeIngestPipelineDefinition(String grokPatter
dateProcessorSettings.put("timezone", "{{ " + BEAT_TIMEZONE_FIELD + " }}");
}
dateProcessorSettings.put("formats", timestampFormats);
if (needNanosecondPrecision) {
dateProcessorSettings.put("output_format", NANOSECOND_DATE_OUTPUT_FORMAT);
}
processors.add(Collections.singletonMap("date", dateProcessorSettings));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,16 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> expl
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
// Note: no convert processors are added based on mappings for NDJSON input
// because it's reasonable that _source matches the supplied JSON precisely
Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone));
Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
timeField.v2().needNanosecondPrecision()));
}

Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker);

SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
Map<String, Object> mappings = mappingsAndFieldStats.v1();
if (timeField != null) {
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
}

if (mappingsAndFieldStats.v2() != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
Map<String, String> messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
SortedMap<String, Object> mappings = new TreeMap<>();
mappings.put("message", messageMapping);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timestampFormatFinder.getEsDateMappingTypeWithoutFormat());

SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker));
Expand Down Expand Up @@ -151,7 +151,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
.setNeedClientTimezone(needClientTimeZone)
.setGrokPattern(grokPattern)
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, customGrokPatternDefinitions, null, mappings,
interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone))
interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone,
timestampFormatFinder.needNanosecondPrecision()))
.setMappings(mappings)
.setFieldStats(fieldStats)
.setExplanation(explanation)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ public final class TimestampFormatFinder {
private static final Logger logger = LogManager.getLogger(TimestampFormatFinder.class);
private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?";
private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,";
private static final Pattern FRACTIONAL_SECOND_INTERPRETER =
Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])");
private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?';
// The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER
// above, but they're literals in this regex to aid readability
Expand Down Expand Up @@ -702,6 +704,20 @@ public List<String> getJavaTimestampFormats() {
(matchedFormats.size() > 1) ? matchedFormats.get(0) : null);
}

/**
 * Reports whether any relevant observed timestamp has finer than millisecond precision.
 * Callers use the answer to choose between the "date" and "date_nanos" index mapping types.
 * Only matches whose format is compatible with the first matched format are considered when
 * more than one format has been matched.
 * @return <code>true</code> if at least one relevant match was flagged as having nanosecond precision.
 */
public boolean needNanosecondPrecision() {
    if (matchedFormats.isEmpty()) {
        // Reaching this point with errorOnNoTimestamp set means no samples were ever added,
        // which most likely indicates a programming error in the caller
        assert errorOnNoTimestamp == false;
        return false;
    }
    // With fewer than two matched formats every match is relevant, so no merge check is needed
    final boolean everyMatchIsRelevant = matchedFormats.size() < 2;
    return matches.stream()
        .anyMatch(match -> (everyMatchIsRelevant || matchedFormats.get(0).canMergeWith(match.timestampFormat))
            && match.hasNanosecondPrecision);
}

/**
* Given a list of timestamp formats that might contain indeterminate day/month parts,
* return the corresponding pattern with the placeholders replaced with concrete
Expand Down Expand Up @@ -947,6 +963,14 @@ public boolean hasTimezoneDependentParsing() {
.anyMatch(match -> match.hasTimezoneDependentParsing);
}

/**
 * Builds the mapping for the @timestamp field. Per the original note on this method, that
 * field will always have been parsed into epoch format, so no format setting is needed and
 * the only decision is whether nanosecond resolution is required.
 * @return A single-entry map giving the mapping type: "date_nanos" when
 *         {@link #needNanosecondPrecision()} is <code>true</code>, otherwise "date".
 */
public Map<String, String> getEsDateMappingTypeWithoutFormat() {
    final String mappingType;
    if (needNanosecondPrecision()) {
        mappingType = "date_nanos";
    } else {
        mappingType = "date";
    }
    return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType);
}

/**
* Sometimes Elasticsearch mappings for dates need to include the format.
* This method returns appropriate mappings settings: at minimum "type" : "date",
Expand All @@ -959,7 +983,7 @@ public Map<String, String> getEsDateMappingTypeWithFormat() {
return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
}
Map<String, String> mapping = new LinkedHashMap<>();
mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
String formats = javaTimestampFormats.stream().map(format -> {
switch (format) {
case "ISO8601":
Expand Down Expand Up @@ -1233,6 +1257,7 @@ static final class TimestampMatch {
final int secondIndeterminateDateNumber;

final boolean hasTimezoneDependentParsing;
final boolean hasNanosecondPrecision;

/**
* Text that came after the timestamp in the matched field/message.
Expand All @@ -1250,6 +1275,8 @@ static final class TimestampMatch {
this.secondIndeterminateDateNumber = indeterminateDateNumbers[1];
this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0),
matchedDate);
this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0),
matchedDate);
this.epilogue = Objects.requireNonNull(epilogue);
}

Expand All @@ -1259,6 +1286,7 @@ static final class TimestampMatch {
this.firstIndeterminateDateNumber = toCopyExceptFormat.firstIndeterminateDateNumber;
this.secondIndeterminateDateNumber = toCopyExceptFormat.secondIndeterminateDateNumber;
this.hasTimezoneDependentParsing = toCopyExceptFormat.hasTimezoneDependentParsing;
this.hasNanosecondPrecision = toCopyExceptFormat.hasNanosecondPrecision;
this.epilogue = toCopyExceptFormat.epilogue;
}

Expand All @@ -1285,6 +1313,43 @@ static boolean requiresTimezoneDependentParsing(String format, String matchedDat
}
}

/**
 * Decides whether one matched timestamp has more than millisecond precision.
 * @param format The first raw Java timestamp format of the matched timestamp format; either one of
 *               the special names handled below or a pattern made up of format letters.
 * @param matchedDate The text of the matched timestamp. Only consulted for the "ISO8601" format.
 * @return <code>true</code> for "TAI64N", for "ISO8601" text with more than 3 fractional second
 *         digits, and for patterns containing more than 3 consecutive unquoted <code>S</code>
 *         letters; <code>false</code> otherwise.
 */
static boolean matchHasNanosecondPrecision(String format, String matchedDate) {
switch (format) {
case "ISO8601":
// The named format does not say how many fractional digits there are, so inspect the
// matched text: group 2 of the pattern captures the digits after the separator
Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(matchedDate);
return matcher.find() && matcher.group(2).length() > 3;
case "UNIX_MS":
case "UNIX":
return false;
case "TAI64N":
return true;
default:
// Scan the pattern for a run of more than 3 'S' letters outside quoted literal sections
boolean notQuoted = true;
int consecutiveSs = 0;
for (int pos = 0; pos < format.length(); ++pos) {
char curChar = format.charAt(pos);
if (curChar == '\'') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just double-checking: can there be a quotation inside another quotation?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A literal ' is specified by two consecutive single quotes - see https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html. This means that no special logic is needed in this method. If there is a literal single quote then this method will treat it as an empty literal section, which is technically wrong but achieves the desired effect.

But since this is quite subtle I will add a comment to say what is going on.

// Literal single quotes are escaped by using two consecutive single quotes.
// Technically this code does the wrong thing in this case, as it flips quoting
// from off to on or on to off and then back. However, since by definition there
// is nothing in between the consecutive single quotes in this case, the net
// effect is correct and good enough for what this method is doing.
notQuoted = !notQuoted;
consecutiveSs = 0;
} else if (notQuoted) {
if (curChar == 'S') {
// A fourth consecutive 'S' means the pattern asks for more than 3 fractional digits
if (++consecutiveSs > 3) {
return true;
}
} else {
consecutiveSs = 0;
}
}
}
return false;
}
}

static int[] parseIndeterminateDateNumbers(String matchedDate, List<String> rawJavaTimestampFormats) {
int[] indeterminateDateNumbers = { -1, -1 };

Expand Down Expand Up @@ -1368,7 +1433,6 @@ public String toString() {
*/
static final class CandidateTimestampFormat {

private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})$");
// This means that in the case of a literal Z, XXX is preferred
private static final Pattern TRAILING_OFFSET_WITHOUT_COLON_FINDER = Pattern.compile("[+-]\\d{4}$");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
.setNeedClientTimezone(needClientTimeZone)
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
Collections.emptyMap(), topLevelTag + "." + timeField.v1(), timeField.v2().getJavaTimestampFormats(),
needClientTimeZone));
needClientTimeZone, timeField.v2().needNanosecondPrecision()));
}

Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
Expand All @@ -114,14 +114,14 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
}

SortedMap<String, Object> innerMappings = mappingsAndFieldStats.v1();
Map<String, Object> innerMappings = mappingsAndFieldStats.v1();
Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
secondLevelProperties.put(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
SortedMap<String, Object> outerMappings = new TreeMap<>();
outerMappings.put(topLevelTag, secondLevelProperties);
if (timeField != null) {
outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
}

FileStructure structure = structureBuilder
Expand Down
Loading