From 65f8bb73978f9abca8ae01e9926d7466bd8d2b39 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 12 May 2023 12:24:59 +0530 Subject: [PATCH] hints kitten: Switch to using a regex engine that supports lookaround Note that we lose Unicode char matching for --type=word because of https://github.com/dlclark/regexp2/issues/65 and of course user regexps can't use \p{N} escapes any more. Hopefully regexp2 will add support for these soon-ish. IMO lookaround is more important than \p. Fixes #6265 --- docs/changelog.rst | 2 + go.mod | 2 +- kittens/hints/marks.go | 142 +++++++++++++++++++++++++++++++----- kittens/hints/marks_test.go | 8 ++ 4 files changed, 135 insertions(+), 19 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 6abf86eebc5..b76d0d88c43 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -52,6 +52,8 @@ Detailed list of changes - hints kitten: Fix a regression in 0.28.0 that broke using sub-groups in regexp captures (:iss:`6228`) +- hints kitten: Fix a regression in 0.28.0 that broke using lookahead/lookbehind in regexp captures (:iss:`6265`) + - diff kitten: Fix a regression in 0.28.0 that broke using relative paths as arguments to the kitten (:iss:`6325`) - Fix re-using the image id of an animated image for a still image causing a crash (:iss:`6244`) diff --git a/go.mod b/go.mod index 8a13188bd1a..c57996f0441 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/alecthomas/chroma/v2 v2.7.0 github.com/bmatcuk/doublestar/v4 v4.6.0 github.com/disintegration/imaging v1.6.2 + github.com/dlclark/regexp2 v1.9.0 github.com/google/go-cmp v0.5.9 github.com/google/uuid v1.3.0 github.com/jamesruan/go-rfc1924 v0.0.0-20170108144916-2767ca7c638f @@ -18,7 +19,6 @@ require ( ) require ( - github.com/dlclark/regexp2 v1.9.0 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a // indirect github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b // indirect 
diff --git a/kittens/hints/marks.go b/kittens/hints/marks.go index f61cc927613..291d4a8466f 100644 --- a/kittens/hints/marks.go +++ b/kittens/hints/marks.go @@ -13,9 +13,11 @@ import ( "os/exec" "path/filepath" "regexp" + "strconv" "strings" "unicode/utf8" + "github.com/dlclark/regexp2" "github.com/seancfoley/ipaddress-go/ipaddr" "golang.org/x/exp/slices" ) @@ -257,9 +259,9 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess if chars == "" { chars = RelevantKittyOpts().Select_by_word_characters } - chars = regexp.QuoteMeta(chars) + chars = regexp2.Escape(chars) chars = strings.ReplaceAll(chars, "-", "\\-") - pattern = fmt.Sprintf(`[%s\pL\pN]{%d,}`, chars, opts.MinimumMatchLength) + pattern = fmt.Sprintf(`(?u)[%s\w\d]{%d,}`, chars, opts.MinimumMatchLength) post_processors = append(post_processors, PostProcessorMap()["brackets"], PostProcessorMap()["quotes"]) default: pattern = opts.Regex @@ -274,11 +276,112 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess return } -func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc, text string, opts *Options) (ans []Mark) { +type Capture struct { + Text string + Text_as_runes []rune + Byte_Offsets struct { + Start, End int + } + Rune_Offsets struct { + Start, End int + } +} + +func (self Capture) String() string { + return fmt.Sprintf("Capture(start=%d, end=%d, %#v)", self.Byte_Offsets.Start, self.Byte_Offsets.End, self.Text) +} + +type Group struct { + Name string + IsNamed bool + Captures []Capture +} + +func (self Group) LastCapture() Capture { + if len(self.Captures) == 0 { + return Capture{} + } + return self.Captures[len(self.Captures)-1] +} + +func (self Group) String() string { + return fmt.Sprintf("Group(name=%#v, captures=%v)", self.Name, self.Captures) +} + +type Match struct { + Groups []Group +} + +func (self Match) HasNamedGroups() bool { + for _, g := range self.Groups { + if g.IsNamed { + return 
true + } + } + return false +} + +func find_all_matches(re *regexp2.Regexp, text string) (ans []Match, err error) { + m, err := re.FindStringMatch(text) + if err != nil { + return + } + rune_to_bytes := utils.RuneOffsetsToByteOffsets(text) + get_byte_offset_map := func(groups []regexp2.Group) (ans map[int]int, err error) { + ans = make(map[int]int, len(groups)*2) + rune_offsets := make([]int, 0, len(groups)*2) + for _, g := range groups { + for _, c := range g.Captures { + if _, found := ans[c.Index]; !found { + rune_offsets = append(rune_offsets, c.Index) + ans[c.Index] = -1 + } + end := c.Index + c.Length + if _, found := ans[end]; !found { + rune_offsets = append(rune_offsets, end) + ans[end] = -1 + } + } + } + slices.Sort(rune_offsets) + for _, pos := range rune_offsets { + if ans[pos] = rune_to_bytes(pos); ans[pos] < 0 { + return nil, fmt.Errorf("Matches are not monotonic cannot map rune offsets to byte offsets") + } + } + return + } + + for m != nil { + groups := m.Groups() + bom, err := get_byte_offset_map(groups) + if err != nil { + return nil, err + } + match := Match{Groups: make([]Group, len(groups))} + for i, g := range m.Groups() { + match.Groups[i].Name = g.Name + match.Groups[i].IsNamed = g.Name != "" && g.Name != strconv.Itoa(i) + for _, c := range g.Captures { + cn := Capture{Text: c.String(), Text_as_runes: c.Runes()} + cn.Rune_Offsets.End = c.Index + c.Length + cn.Rune_Offsets.Start = c.Index + cn.Byte_Offsets.Start, cn.Byte_Offsets.End = bom[c.Index], bom[cn.Rune_Offsets.End] + match.Groups[i].Captures = append(match.Groups[i].Captures, cn) + } + } + ans = append(ans, match) + m, _ = re.FindNextMatch(m) + } + return +} + +func mark(r *regexp2.Regexp, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc, text string, opts *Options) (ans []Mark) { sanitize_pat := regexp.MustCompile("[\r\n\x00]") - names := r.SubexpNames() - for i, v := range r.FindAllStringSubmatchIndex(text, -1) { - match_start, match_end := v[0], v[1] + 
all_matches, _ := find_all_matches(r, text) + for i, m := range all_matches { + full_capture := m.Groups[0].LastCapture() + match_start, match_end := full_capture.Byte_Offsets.Start, full_capture.Byte_Offsets.End for match_end > match_start+1 && text[match_end-1] == 0 { match_end-- } @@ -296,14 +399,14 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor continue } full_match = sanitize_pat.ReplaceAllLiteralString(text[match_start:match_end], "") - gd := make(map[string]string, len(names)) - for x, name := range names { - if name != "" { - idx := 2 * x - if s, e := v[idx], v[idx+1]; s > -1 && e > -1 { + gd := make(map[string]string, len(m.Groups)) + for idx, g := range m.Groups { + if idx > 0 && g.IsNamed { + c := g.LastCapture() + if s, e := c.Byte_Offsets.Start, c.Byte_Offsets.End; s > -1 && e > -1 { s = utils.Max(s, match_start) e = utils.Min(e, match_end) - gd[name] = sanitize_pat.ReplaceAllLiteralString(text[s:e], "") + gd[g.Name] = sanitize_pat.ReplaceAllLiteralString(text[s:e], "") } } } @@ -314,15 +417,18 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor for k, v := range gd { gd2[k] = v } - if opts.Type == "regex" && len(names) > 1 && names[1] == "" { - ms, me := v[2], v[3] + if opts.Type == "regex" && len(m.Groups) > 1 && !m.HasNamedGroups() { + cp := m.Groups[1].LastCapture() + ms, me := cp.Byte_Offsets.Start, cp.Byte_Offsets.End match_start = utils.Max(match_start, ms) match_end = utils.Min(match_end, me) full_match = sanitize_pat.ReplaceAllLiteralString(text[match_start:match_end], "") } - ans = append(ans, Mark{ - Index: i, Start: match_start, End: match_end, Text: full_match, Groupdict: gd2, - }) + if full_match != "" { + ans = append(ans, Mark{ + Index: i, Start: match_start, End: match_end, Text: full_match, Groupdict: gd2, + }) + } } return } @@ -362,7 +468,7 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text run_basic_matching := func() error { 
pattern, post_processors, group_processors := functions_for(opts) - r, err := regexp.Compile(pattern) + r, err := regexp2.Compile(pattern, regexp2.RE2) if err != nil { return fmt.Errorf("Failed to compile the regex pattern: %#v with error: %w", pattern, err) } diff --git a/kittens/hints/marks_test.go b/kittens/hints/marks_test.go index f92234971d0..c5381463d5c 100644 --- a/kittens/hints/marks_test.go +++ b/kittens/hints/marks_test.go @@ -111,6 +111,14 @@ func TestHintMarking(t *testing.T) { opts.Type = "regex" opts.Regex = `(?ms)^[*]?\s(\S+)` r(`* 2b687c2 - test1`, `2b687c2`) + opts.Regex = `(?<=got: )sha256.{4}` + r(`got: sha256-L8=`, `sha256-L8=`) + + reset() + opts.Type = "word" + r(`#one (two) 😍 a-1b `, `#one`, `two`, `a-1b`) + // non-ascii words dont match because of https://github.com/dlclark/regexp2/issues/65 + // r(`fōtiz час`, `fōtiz`, `час`) reset() tdir := t.TempDir()