Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Minor improvements to categorization Grok pattern creation #33353

Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@
*/
public final class GrokPatternCreator {

private static String PREFACE = "preface";
private static String EPILOGUE = "epilogue";
private static final String PREFACE = "preface";
private static final String EPILOGUE = "epilogue";

/**
* The first match in this list will be chosen, so it needs to be ordered
* such that more generic patterns come after more specific patterns.
*/
private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
Expand All @@ -41,7 +42,6 @@ public final class GrokPatternCreator {
new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
new GrokPatternCandidate("HTTPDATE", "timestamp"),
new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
new GrokPatternCandidate("DATE", "date"),
new GrokPatternCandidate("TIME", "time"),
Expand All @@ -56,12 +56,10 @@ public final class GrokPatternCreator {
new GrokPatternCandidate("IP", "ipaddress"),
// This already includes pre/post break conditions
new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
// Can't use \b as the break before, because it doesn't work for negative numbers (the
// minus sign is not a "word" character)
new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
// Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
// numbers that NUMBER rejected due to preceding characters
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
// Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
// up numeric suffixes too eagerly
new GrokPatternCandidate("NUMBER", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
// TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
// Fixing these problems with overly broad matches would require some extra intelligence
// to be added to remove inappropriate matches. One idea would be to use a dictionary,
Expand Down