From 65f8bb73978f9abca8ae01e9926d7466bd8d2b39 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 12 May 2023 12:24:59 +0530
Subject: [PATCH] hints kitten: Switch to using a regex engine that supports
 lookaround

Note that we lose Unicode character matching for --type=word because of
https://github.com/dlclark/regexp2/issues/65 and, of course, user regexps
can't use \p{N} escapes any more. Hopefully regexp2 will add support for
these soon-ish. IMO lookaround is more important than \p.

Fixes #6265
---
 docs/changelog.rst          |   2 +
 go.mod                      |   2 +-
 kittens/hints/marks.go      | 142 +++++++++++++++++++++++++++++++-----
 kittens/hints/marks_test.go |   8 ++
 4 files changed, 135 insertions(+), 19 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 6abf86eebc5..b76d0d88c43 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -52,6 +52,8 @@ Detailed list of changes
 
 - hints kitten: Fix a regression in 0.28.0 that broke using sub-groups in regexp captures (:iss:`6228`)
 
+- hints kitten: Fix a regression in 0.28.0 that broke using lookahead/lookbehind in regexp captures (:iss:`6265`)
+
 - diff kitten: Fix a regression in 0.28.0 that broke using relative paths as arguments to the kitten (:iss:`6325`)
 
 - Fix re-using the image id of an animated image for a still image causing a crash (:iss:`6244`)
diff --git a/go.mod b/go.mod
index 8a13188bd1a..c57996f0441 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/alecthomas/chroma/v2 v2.7.0
 	github.com/bmatcuk/doublestar/v4 v4.6.0
 	github.com/disintegration/imaging v1.6.2
+	github.com/dlclark/regexp2 v1.9.0
 	github.com/google/go-cmp v0.5.9
 	github.com/google/uuid v1.3.0
 	github.com/jamesruan/go-rfc1924 v0.0.0-20170108144916-2767ca7c638f
@@ -18,7 +19,6 @@ require (
 )
 
 require (
-	github.com/dlclark/regexp2 v1.9.0 // indirect
 	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a // indirect
 	github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b // indirect
diff --git a/kittens/hints/marks.go b/kittens/hints/marks.go
index f61cc927613..291d4a8466f 100644
--- a/kittens/hints/marks.go
+++ b/kittens/hints/marks.go
@@ -13,9 +13,11 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"strconv"
 	"strings"
 	"unicode/utf8"
 
+	"github.com/dlclark/regexp2"
 	"github.com/seancfoley/ipaddress-go/ipaddr"
 	"golang.org/x/exp/slices"
 )
@@ -257,9 +259,9 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess
 		if chars == "" {
 			chars = RelevantKittyOpts().Select_by_word_characters
 		}
-		chars = regexp.QuoteMeta(chars)
+		chars = regexp2.Escape(chars)
 		chars = strings.ReplaceAll(chars, "-", "\\-")
-		pattern = fmt.Sprintf(`[%s\pL\pN]{%d,}`, chars, opts.MinimumMatchLength)
+		pattern = fmt.Sprintf(`(?u)[%s\w\d]{%d,}`, chars, opts.MinimumMatchLength)
 		post_processors = append(post_processors, PostProcessorMap()["brackets"], PostProcessorMap()["quotes"])
 	default:
 		pattern = opts.Regex
@@ -274,11 +276,112 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess
 	return
 }
 
-func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc, text string, opts *Options) (ans []Mark) {
+type Capture struct {
+	Text          string
+	Text_as_runes []rune
+	Byte_Offsets  struct {
+		Start, End int
+	}
+	Rune_Offsets struct {
+		Start, End int
+	}
+}
+
+func (self Capture) String() string {
+	return fmt.Sprintf("Capture(start=%d, end=%d, %#v)", self.Byte_Offsets.Start, self.Byte_Offsets.End, self.Text)
+}
+
+type Group struct {
+	Name     string
+	IsNamed  bool
+	Captures []Capture
+}
+
+func (self Group) LastCapture() Capture {
+	if len(self.Captures) == 0 {
+		return Capture{}
+	}
+	return self.Captures[len(self.Captures)-1]
+}
+
+func (self Group) String() string {
+	return fmt.Sprintf("Group(name=%#v, captures=%v)", self.Name, self.Captures)
+}
+
+type Match struct {
+	Groups []Group
+}
+
+func (self Match) HasNamedGroups() bool {
+	for _, g := range self.Groups {
+		if g.IsNamed {
+			return true
+		}
+	}
+	return false
+}
+
+func find_all_matches(re *regexp2.Regexp, text string) (ans []Match, err error) {
+	m, err := re.FindStringMatch(text)
+	if err != nil {
+		return
+	}
+	rune_to_bytes := utils.RuneOffsetsToByteOffsets(text)
+	get_byte_offset_map := func(groups []regexp2.Group) (ans map[int]int, err error) {
+		ans = make(map[int]int, len(groups)*2)
+		rune_offsets := make([]int, 0, len(groups)*2)
+		for _, g := range groups {
+			for _, c := range g.Captures {
+				if _, found := ans[c.Index]; !found {
+					rune_offsets = append(rune_offsets, c.Index)
+					ans[c.Index] = -1
+				}
+				end := c.Index + c.Length
+				if _, found := ans[end]; !found {
+					rune_offsets = append(rune_offsets, end)
+					ans[end] = -1
+				}
+			}
+		}
+		slices.Sort(rune_offsets)
+		for _, pos := range rune_offsets {
+			if ans[pos] = rune_to_bytes(pos); ans[pos] < 0 {
+				return nil, fmt.Errorf("Matches are not monotonic cannot map rune offsets to byte offsets")
+			}
+		}
+		return
+	}
+
+	for m != nil {
+		groups := m.Groups()
+		bom, err := get_byte_offset_map(groups)
+		if err != nil {
+			return nil, err
+		}
+		match := Match{Groups: make([]Group, len(groups))}
+		for i, g := range m.Groups() {
+			match.Groups[i].Name = g.Name
+			match.Groups[i].IsNamed = g.Name != "" && g.Name != strconv.Itoa(i)
+			for _, c := range g.Captures {
+				cn := Capture{Text: c.String(), Text_as_runes: c.Runes()}
+				cn.Rune_Offsets.End = c.Index + c.Length
+				cn.Rune_Offsets.Start = c.Index
+				cn.Byte_Offsets.Start, cn.Byte_Offsets.End = bom[c.Index], bom[cn.Rune_Offsets.End]
+				match.Groups[i].Captures = append(match.Groups[i].Captures, cn)
+			}
+		}
+		ans = append(ans, match)
+		m, _ = re.FindNextMatch(m)
+	}
+	return
+}
+
+func mark(r *regexp2.Regexp, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc, text string, opts *Options) (ans []Mark) {
 	sanitize_pat := regexp.MustCompile("[\r\n\x00]")
-	names := r.SubexpNames()
-	for i, v := range r.FindAllStringSubmatchIndex(text, -1) {
-		match_start, match_end := v[0], v[1]
+	all_matches, _ := find_all_matches(r, text)
+	for i, m := range all_matches {
+		full_capture := m.Groups[0].LastCapture()
+		match_start, match_end := full_capture.Byte_Offsets.Start, full_capture.Byte_Offsets.End
 		for match_end > match_start+1 && text[match_end-1] == 0 {
 			match_end--
 		}
@@ -296,14 +399,14 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor
 			continue
 		}
 		full_match = sanitize_pat.ReplaceAllLiteralString(text[match_start:match_end], "")
-		gd := make(map[string]string, len(names))
-		for x, name := range names {
-			if name != "" {
-				idx := 2 * x
-				if s, e := v[idx], v[idx+1]; s > -1 && e > -1 {
+		gd := make(map[string]string, len(m.Groups))
+		for idx, g := range m.Groups {
+			if idx > 0 && g.IsNamed {
+				c := g.LastCapture()
+				if s, e := c.Byte_Offsets.Start, c.Byte_Offsets.End; s > -1 && e > -1 {
 					s = utils.Max(s, match_start)
 					e = utils.Min(e, match_end)
-					gd[name] = sanitize_pat.ReplaceAllLiteralString(text[s:e], "")
+					gd[g.Name] = sanitize_pat.ReplaceAllLiteralString(text[s:e], "")
 				}
 			}
 		}
@@ -314,15 +417,18 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor
 		for k, v := range gd {
 			gd2[k] = v
 		}
-		if opts.Type == "regex" && len(names) > 1 && names[1] == "" {
-			ms, me := v[2], v[3]
+		if opts.Type == "regex" && len(m.Groups) > 1 && !m.HasNamedGroups() {
+			cp := m.Groups[1].LastCapture()
+			ms, me := cp.Byte_Offsets.Start, cp.Byte_Offsets.End
 			match_start = utils.Max(match_start, ms)
 			match_end = utils.Min(match_end, me)
 			full_match = sanitize_pat.ReplaceAllLiteralString(text[match_start:match_end], "")
 		}
-		ans = append(ans, Mark{
-			Index: i, Start: match_start, End: match_end, Text: full_match, Groupdict: gd2,
-		})
+		if full_match != "" {
+			ans = append(ans, Mark{
+				Index: i, Start: match_start, End: match_end, Text: full_match, Groupdict: gd2,
+			})
+		}
 	}
 	return
 }
@@ -362,7 +468,7 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
 
 	run_basic_matching := func() error {
 		pattern, post_processors, group_processors := functions_for(opts)
-		r, err := regexp.Compile(pattern)
+		r, err := regexp2.Compile(pattern, regexp2.RE2)
 		if err != nil {
 			return fmt.Errorf("Failed to compile the regex pattern: %#v with error: %w", pattern, err)
 		}
diff --git a/kittens/hints/marks_test.go b/kittens/hints/marks_test.go
index f92234971d0..c5381463d5c 100644
--- a/kittens/hints/marks_test.go
+++ b/kittens/hints/marks_test.go
@@ -111,6 +111,14 @@ func TestHintMarking(t *testing.T) {
 	opts.Type = "regex"
 	opts.Regex = `(?ms)^[*]?\s(\S+)`
 	r(`* 2b687c2 - test1`, `2b687c2`)
+	opts.Regex = `(?<=got:    )sha256.{4}`
+	r(`got:    sha256-L8=`, `sha256-L8=`)
+
+	reset()
+	opts.Type = "word"
+	r(`#one (two) 😍 a-1b `, `#one`, `two`, `a-1b`)
+	// Non-ASCII words don't match because of https://github.com/dlclark/regexp2/issues/65
+	// r(`fōtiz час`, `fōtiz`, `час`)
 
 	reset()
 	tdir := t.TempDir()