Skip to content

Commit

Permalink
chore(bloom): support simplifiable regexp matchers
Browse files Browse the repository at this point in the history
This adds support for basic regexps which can be simplified into a
sequence of OR matchers, such as:

* `key=~"value"` becomes `key="value"`
* `key=~"value1|value2"` becomes `key="value1" or key="value2"`.

Matchers like `key=~".+"` continue to not be supported because the lack
of a key doesn't mean that it doesn't exist as a label.

Only the cases above are "officially" supported. However, we technically
support basic concatenations and character classes due to how
regexp/syntax parses and simplifies expressions such as `value1|value2`
into `value[12]`.

To prevent unbounded cardinality, we limit regexp expansion to 25
matchers; otherwise a regexp like `value[0-9][0-9][0-9][0-9]` would
expand into 10,000 matchers (too many!).
  • Loading branch information
rfratto committed Oct 25, 2024
1 parent 5824e3d commit 89ad662
Show file tree
Hide file tree
Showing 3 changed files with 269 additions and 7 deletions.
4 changes: 4 additions & 0 deletions docs/sources/query/query_accceleration.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ If [bloom filters][] are enabled, you can write LogQL queries using [structured
Queries will be accelerated for any [label filter expression][] that satisfies _all_ of the following criteria:

* The label filter expression uses **string equality**, such as `| key="value"`.
* `or` and `and` operators can be used to match multiple values, such as `| detected_level="error" or detected_level="warn"`.
* _Basic_ regular expressions are automatically simplified into a supported expression:
* `| key=~"value"` is converted to `| key="value"`.
* `| key=~"value1|value2"` is converted to `| key="value1" or key="value2"`.
* The label filter expression is querying for structured metadata and not a stream label.
* The label filter expression is placed before any [parser expression][], [labels format expression][], [drop labels expression][], or [keep labels expression][].

Expand Down
172 changes: 166 additions & 6 deletions pkg/storage/bloom/v1/ast_extractor.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
package v1

import (
regexsyn "github.com/grafana/regexp/syntax"

"github.com/prometheus/prometheus/model/labels"

"github.com/grafana/loki/v3/pkg/logql/log"
"github.com/grafana/loki/v3/pkg/logql/syntax"
"github.com/grafana/loki/v3/pkg/util"
)

// Simplifiable regexp expressions can quickly expand into very high
// cardinality; we limit the number of matchers to prevent this.
//
// For example, the regex `[0-9]` expands to 10 matchers (0, 1, .. 9), while
// `[0-9][0-9]` expands to 100 matchers (00, 01, .., 99).
//
// expandSubexpr stops and reports failure as soon as an expansion would
// produce more than this many values; buildSimplifiedRegexMatcher then
// returns UnsupportedLabelMatcher for that expression.
const maxRegexMatchers = 25

// LabelMatcher represents bloom tests for key-value pairs, mapped from
// LabelFilterExprs from the AST.
//
// The only method is the unexported marker isLabelMatcher, so the set of
// implementations is limited to types declared in this package.
type LabelMatcher interface{ isLabelMatcher() }
Expand Down Expand Up @@ -54,14 +64,20 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
switch filter := filter.(type) {

case *log.LineFilterLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
if filter.Type == labels.MatchEqual {
return PlainLabelMatcher{
Key: filter.Name,
Value: filter.Value,
}
} else if filter.Type == labels.MatchRegexp {
reg, err := regexsyn.Parse(filter.Value, regexsyn.Perl)
if err != nil {
return UnsupportedLabelMatcher{}
}
return buildSimplifiedRegexMatcher(filter.Name, reg.Simplify())
}

return PlainLabelMatcher{
Key: filter.Name,
Value: filter.Value,
}
return UnsupportedLabelMatcher{}

case *log.StringLabelFilter:
if filter.Type != labels.MatchEqual {
Expand Down Expand Up @@ -89,6 +105,150 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
}
}

// buildSimplifiedRegexMatcher builds a simplified label matcher for key from
// an already-simplified regex. reg may be mutated.
//
// The following top-level operations are handled:
//
//   - OpLiteral becomes a single PlainLabelMatcher.
//   - OpAlternate becomes a left-leaning chain of OrLabelMatcher, one branch
//     per alternative.
//   - OpConcat is expanded through expandSubexpr into every concrete value it
//     can match, and those values are OR-ed together.
//   - OpCapture is unwrapped and its content is processed again.
//
// Everything else, as well as any expansion that fails or exceeds
// maxRegexMatchers inside expandSubexpr, yields UnsupportedLabelMatcher.
func buildSimplifiedRegexMatcher(key string, reg *regexsyn.Regexp) LabelMatcher {
	switch reg.Op {
	case regexsyn.OpAlternate:
		util.ClearCapture(reg)

		// NOTE(review): the number of alternatives handled here is not checked
		// against maxRegexMatchers; only expansions that go through
		// expandSubexpr are bounded. Confirm whether that is intended.
		left := buildSimplifiedRegexMatcher(key, reg.Sub[0])
		for _, sub := range reg.Sub[1:] {
			right := buildSimplifiedRegexMatcher(key, sub)
			left = OrLabelMatcher{Left: left, Right: right}
		}
		return left

	case regexsyn.OpConcat:
		// OpConcat checks for the concatenation of two or more subexpressions. For
		// example, value1|value2 simplifies to value[12], with the two
		// subexpressions value and [12].
		//
		// We expand subexpressions back out into full matchers where possible, so
		// value[12] becomes value1 OR value2, and value[1-9] becomes value1 OR
		// value2 .. OR value9.
		util.ClearCapture(reg)

		matchers, ok := expandSubexpr(reg)
		if !ok || len(matchers) == 0 {
			return UnsupportedLabelMatcher{}
		}

		var left LabelMatcher = PlainLabelMatcher{Key: key, Value: matchers[0]}
		for _, matcher := range matchers[1:] {
			right := PlainLabelMatcher{Key: key, Value: matcher}
			left = OrLabelMatcher{Left: left, Right: right}
		}
		return left

	case regexsyn.OpCapture:
		// presumably util.ClearCapture replaces the capture node with its
		// content in place, so the recursion sees the inner expression —
		// verify against the helper.
		util.ClearCapture(reg)
		return buildSimplifiedRegexMatcher(key, reg)

	case regexsyn.OpLiteral:
		// A literal carrying FoldCase matches every casing of its runes (for
		// example `(?i)value`, or `[Vv]alue`, whose first class the parser
		// rewrites to a fold-case literal). Testing the bloom for only the
		// stored spelling would wrongly exclude the other casings, so such
		// literals are not simplified.
		if reg.Flags&regexsyn.FoldCase != 0 {
			return UnsupportedLabelMatcher{}
		}
		return PlainLabelMatcher{
			Key:   key,
			Value: string(reg.Rune),
		}

	default:
		return UnsupportedLabelMatcher{}
	}
}

// expandSubexpr enumerates every concrete string that reg can match.
//
// It returns ok=false when reg contains an operation that cannot be
// enumerated, when a literal is case-insensitive, or when the enumeration
// would produce more than maxRegexMatchers values. reg may be mutated.
func expandSubexpr(reg *regexsyn.Regexp) (prefixes []string, ok bool) {
	switch reg.Op {
	case regexsyn.OpAlternate:
		util.ClearCapture(reg)

		// The result is the union of what each alternative expands to.
		for _, sub := range reg.Sub {
			subPrefixes, subOK := expandSubexpr(sub)
			if !subOK {
				return nil, false
			}
			if len(prefixes)+len(subPrefixes) > maxRegexMatchers {
				return nil, false
			}
			prefixes = append(prefixes, subPrefixes...)
		}
		return prefixes, true

	case regexsyn.OpCharClass:
		// OpCharClass stores ranges of characters, so [12] is the range of bytes
		// []rune('1', '2'), while [15] is represented as []rune('1', '1', '5',
		// '5').
		//
		// To expand OpCharClass, we iterate over each pair of runes.
		if len(reg.Rune)%2 != 0 {
			// Invalid regexp; sequences should be even.
			return nil, false
		}

		for i := 0; i < len(reg.Rune); i += 2 {
			start, end := reg.Rune[i], reg.Rune[i+1]
			for r := start; r <= end; r++ {
				prefixes = append(prefixes, string(r))
				// Bail out as soon as the limit is crossed so that wide ranges
				// are never enumerated in full.
				if len(prefixes) > maxRegexMatchers {
					return nil, false
				}
			}
		}

		return prefixes, true

	case regexsyn.OpConcat:
		if len(reg.Sub) == 0 {
			return nil, false
		}

		// We get the prefixes for each subexpression and then iteratively combine
		// them together.
		//
		// For the regexp [12][34]value (which concatenates [12], [34], and value):
		//
		// 1. We get the prefixes for [12], which are 1 and 2.
		// 2. We get the prefixes for [34], which are 3 and 4.
		// 3. We add the prefixes together to get 13, 14, 23, and 24.
		// 4. We get the prefixes for value, which is value.
		// 5. Finally, we add the prefixes together to get 13value, 14value, 23value, and 24value.
		curPrefixes, firstOK := expandSubexpr(reg.Sub[0])
		if !firstOK {
			return nil, false
		}

		for _, sub := range reg.Sub[1:] {
			subPrefixes, subOK := expandSubexpr(sub)
			if !subOK {
				return nil, false
			}
			// Check the size of the cross product before building it.
			if len(curPrefixes)*len(subPrefixes) > maxRegexMatchers {
				return nil, false
			}

			newPrefixes := make([]string, 0, len(curPrefixes)*len(subPrefixes))

			for _, curPrefix := range curPrefixes {
				for _, subPrefix := range subPrefixes {
					newPrefixes = append(newPrefixes, curPrefix+subPrefix)
				}
			}

			curPrefixes = newPrefixes
		}

		return curPrefixes, true

	case regexsyn.OpCapture:
		// presumably util.ClearCapture replaces the capture node with its
		// content in place, so the recursion sees the inner expression —
		// verify against the helper.
		util.ClearCapture(reg)
		return expandSubexpr(reg)

	case regexsyn.OpLiteral:
		// A literal carrying FoldCase matches every casing of its runes (for
		// example `(?i)value1|value2`, or `[Vv]alue[12]`). Emitting only the
		// stored spelling would drop the other casings, so give up instead.
		if reg.Flags&regexsyn.FoldCase != 0 {
			return nil, false
		}
		prefixes = append(prefixes, string(reg.Rune))
		return prefixes, true

	default:
		return nil, false
	}
}

//
// Implement marker types:
//
Expand Down
100 changes: 99 additions & 1 deletion pkg/storage/bloom/v1/ast_extractor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,89 @@ func TestExtractLabelMatchers(t *testing.T) {
},

{
name: "unsupported label matchers",
name: "basic regex matcher",
input: `{app="foo"} | key1=~"value1"`,
expect: []v1.LabelMatcher{
v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
},
},

{
name: "regex matcher short", // simplifies to value[15].
input: `{app="foo"} | key1=~"value1|value5"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
v1.PlainLabelMatcher{Key: "key1", Value: "value5"},
},
},
},

{
name: "regex matcher range",
input: `{app="foo"} | key1=~"value[0-9]"`,
expect: []v1.LabelMatcher{
buildOrMatchers(
v1.PlainLabelMatcher{Key: "key1", Value: "value0"},
v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
v1.PlainLabelMatcher{Key: "key1", Value: "value2"},
v1.PlainLabelMatcher{Key: "key1", Value: "value3"},
v1.PlainLabelMatcher{Key: "key1", Value: "value4"},
v1.PlainLabelMatcher{Key: "key1", Value: "value5"},
v1.PlainLabelMatcher{Key: "key1", Value: "value6"},
v1.PlainLabelMatcher{Key: "key1", Value: "value7"},
v1.PlainLabelMatcher{Key: "key1", Value: "value8"},
v1.PlainLabelMatcher{Key: "key1", Value: "value9"},
),
},
},

{
name: "regex matcher ignore high cardinality",
input: `{app="foo"} | key1=~"value[0-9][0-9]"`, // This would expand to 100 matchers. Too many!
expect: []v1.LabelMatcher{
v1.UnsupportedLabelMatcher{},
},
},

{
name: "regex matcher",
input: `{app="foo"} | key1=~"value123|value456"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.PlainLabelMatcher{Key: "key1", Value: "value123"},
v1.PlainLabelMatcher{Key: "key1", Value: "value456"},
},
},
},

{
name: "regex multiple expands",
input: `{app="foo"} | detected_level=~"debug|info|warn|error"`,
expect: []v1.LabelMatcher{
buildOrMatchers(
v1.PlainLabelMatcher{Key: "detected_level", Value: "debug"},
v1.PlainLabelMatcher{Key: "detected_level", Value: "info"},
v1.PlainLabelMatcher{Key: "detected_level", Value: "warn"},
v1.PlainLabelMatcher{Key: "detected_level", Value: "error"},
),
},
},

{
name: "regex matcher with ignored capture groups",
input: `{app="foo"} | key1=~"value1|(value2)"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
v1.PlainLabelMatcher{Key: "key1", Value: "value2"},
},
},
},

{
name: "unsupported label matchers",
input: `{app="foo"} | key1!="value1"`,
expect: []v1.LabelMatcher{
v1.UnsupportedLabelMatcher{},
},
Expand All @@ -73,6 +154,23 @@ func TestExtractLabelMatchers(t *testing.T) {
}
}

// buildOrMatchers folds matchers into a left-leaning chain of
// v1.OrLabelMatcher values: (((m0 OR m1) OR m2) OR ...). A single matcher is
// returned unchanged. At least one matcher must be supplied.
func buildOrMatchers(matchers ...v1.LabelMatcher) v1.LabelMatcher {
	combined := matchers[0]
	for i := 1; i < len(matchers); i++ {
		combined = v1.OrLabelMatcher{Left: combined, Right: matchers[i]}
	}
	return combined
}

func TestExtractLabelMatchers_IgnoreAfterParse(t *testing.T) {
tt := []struct {
name string
Expand Down

0 comments on commit 89ad662

Please sign in to comment.