forked from G-Research/geras
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request G-Research#49 from G-Research/regexpopt
Regexp "optimiser", use to implement basic NRE support
- Loading branch information
Showing
4 changed files
with
242 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
// Package regexputil implements some introspection on parsed regexps. | ||
package regexputil | ||
|
||
import ( | ||
"regexp/syntax" | ||
) | ||
|
||
// Regexp represents a parsed regexp. Use Parse to make one.
type Regexp struct {
	pt *syntax.Regexp // simplified parse tree; nil for the zero value
}

// Parse takes a regexp in Perl syntax (as implemented by regexp/syntax) and
// returns a Regexp, for introspecting the regexp.
//
// The parse tree is simplified before it is stored. If the expression does
// not parse, the zero Regexp and the parser's error are returned.
func Parse(regexp string) (Regexp, error) {
	pt, err := syntax.Parse(regexp, syntax.Perl)
	if err != nil {
		return Regexp{}, err
	}
	return Regexp{pt: pt.Simplify()}, nil
}

// List returns a list of fixed matches if the regexp only matches a fixed set
// of alternate strings.
//
// The boolean is false (and the slice nil) when the regexp uses any construct
// this package does not understand, when it yields no candidate strings at
// all, or when r is the zero value.
func (r Regexp) List() ([]string, bool) {
	if r.pt == nil {
		// Zero-value Regexp: nothing was parsed, so nothing can be listed.
		return nil, false
	}
	potential := r.recurse([]*syntax.Regexp{r.pt}, 0, 0)
	if len(potential) == 0 {
		return nil, false
	}
	items := make([]string, len(potential))
	for i, p := range potential {
		items[i] = string(p)
	}
	return items, true
}

// recurse returns the fixed strings matched by the sibling nodes p, or nil if
// any of them (at any depth) cannot be reduced to a fixed set of strings.
// parentOp is the operator of the node that owns p and level is the current
// nesting depth.
func (r Regexp) recurse(p []*syntax.Regexp, parentOp syntax.Op, level int) [][]rune {
	potential, ok := r.expand(p, parentOp, level)
	if !ok {
		return nil
	}
	return potential
}

// expand does the work for recurse. It reports failure through an explicit
// flag rather than a nil slice: a nil slice is also the legitimate result of
// a sub-expression that contributes no strings (for example a lone anchor),
// and conflating the two would let an unsupported sub-expression such as the
// capture in `(.*)xx` be silently dropped.
func (r Regexp) expand(nodes []*syntax.Regexp, parentOp syntax.Op, level int) ([][]rune, bool) {
	// Concat, Capture, Alternate, (a leaf op) is the most we handle
	if level > 3 {
		return nil, false
	}
	var potential [][]rune
	for i, s := range nodes {
		// Give up on (?i), etc.: the literal text no longer describes the match set.
		if s.Flags&(syntax.FoldCase|syntax.DotNL) != 0 {
			return nil, false
		}
		switch s.Op {
		case syntax.OpConcat, syntax.OpCapture, syntax.OpAlternate:
			// Combining an already collected set with a nested group would
			// need a cross product, which is not implemented.
			if len(potential) != 0 {
				return nil, false
			}
			sub, ok := r.expand(s.Sub, s.Op, level+1)
			if !ok {
				return nil, false
			}
			potential = sub
		case syntax.OpCharClass:
			if len(potential) > 0 && parentOp != syntax.OpAlternate {
				return nil, false
			}
			// Rune is a list of pairs of character ranges in this case, we have to expand.
			// NOTE(review): a wide class such as [^a] expands to one entry per
			// code point; callers may want a cap.
			for j := 0; j+1 < len(s.Rune); j += 2 {
				for c := s.Rune[j]; c <= s.Rune[j+1]; c++ {
					potential = append(potential, []rune{c})
				}
			}
		case syntax.OpLiteral:
			if len(potential) > 0 && parentOp != syntax.OpAlternate {
				return nil, false
			}
			potential = append(potential, s.Rune)
		case syntax.OpEmptyMatch:
			if len(potential) > 0 && parentOp != syntax.OpAlternate {
				return nil, false
			}
			potential = append(potential, []rune{})
		// We only handle full matches on single lines as that's what Prometheus uses.
		// ^ and $ are therefore meaningless, but people do use them, so ignore if in the correct place.
		case syntax.OpBeginText:
			if i != 0 {
				// invalid, skip
				return nil, false
			}
		case syntax.OpEndText:
			if i != len(nodes)-1 {
				// invalid, skip
				return nil, false
			}
		default:
			return nil, false // unknown op, can't do anything
		}
	}
	return potential, true
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package regexputil_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/G-Research/geras/pkg/regexputil" | ||
) | ||
|
||
func TestList(t *testing.T) { | ||
for _, r := range []struct { | ||
re string | ||
err bool | ||
ok bool | ||
expect []string | ||
}{ | ||
// Normal cases we expect to handle | ||
{"", false, true, []string{""}}, | ||
{"xx|yy", false, true, []string{"xx", "yy"}}, | ||
{"xx|yy", false, true, []string{"xx", "yy"}}, | ||
{"(xx|yy)", false, true, []string{"xx", "yy"}}, | ||
{"(?:xx|yy)", false, true, []string{"xx", "yy"}}, | ||
{"^(?:xx|yy)$", false, true, []string{"xx", "yy"}}, | ||
{"^(xx|yy)$", false, true, []string{"xx", "yy"}}, | ||
{"^(xx|yy)", false, true, []string{"xx", "yy"}}, | ||
{"^(xx|yy)", false, true, []string{"xx", "yy"}}, | ||
// Handled as CharClasses instead of Literals, so test explicitly. | ||
{"x|y|z", false, true, []string{"x", "y", "z"}}, | ||
{"([ab])", false, true, []string{"a", "b"}}, | ||
{"[a-f]", false, true, []string{"a", "b", "c", "d", "e", "f"}}, | ||
|
||
// We don't handle some aspect | ||
{"(^xx|^yy)", false, false, nil}, // Would be easy, but who writes regexps like that anyway. | ||
{"^$", false, false, nil}, // Better BeginText/EndText handling could fix this too, probably not worth it. | ||
{"^(?i:xx|yy)$", false, false, nil}, | ||
{"(xx|yy.)", false, false, nil}, | ||
{"(xx|yy.*)", false, false, nil}, | ||
{"(xx|yy).", false, false, nil}, | ||
{"(xx|yy).*", false, false, nil}, | ||
{"(xx|yy)*", false, false, nil}, | ||
{".", false, false, nil}, | ||
} { | ||
p, err := regexputil.Parse(r.re) | ||
if err != nil { | ||
if !r.err { | ||
t.Errorf("%q: got err, want !err", r.re) | ||
} | ||
continue | ||
} | ||
if r.err { | ||
t.Errorf("%q: got !err, want err", r.re) | ||
} | ||
l, ok := p.List() | ||
if ok != r.ok { | ||
t.Errorf("%q: got %v, want %v", r.re, ok, r.ok) | ||
} | ||
if len(l) != len(r.expect) { | ||
t.Errorf("%q: got %d items, want %d", r.re, len(l), len(r.expect)) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters