Skip to content

Commit b71c1a2

Browse files
committed
Extract matched context from markdown tables
* Searches now match content case-insensitively. * Only the matched table cell is extracted, so the results list is not flooded with text.
1 parent 5dc9281 commit b71c1a2

File tree

5 files changed

+150
-44
lines changed

5 files changed

+150
-44
lines changed

notebook/search.go

+2-35
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
package notebook
22

33
import (
4-
"github.com/lithammer/fuzzysearch/fuzzy"
54
"github.com/msp301/zb/graph"
65
"github.com/msp301/zb/parser"
76
"github.com/msp301/zb/util"
8-
"regexp"
97
"strings"
108
)
119

@@ -23,7 +21,7 @@ func (book *Notebook) Search(query ...string) []Result {
2321
for _, paragraph := range paragraphs {
2422
termsMatched := 0
2523
for _, term := range query {
26-
if matches(paragraph, term) {
24+
if util.ParagraphMatches(paragraph, term) {
2725
termsMatched++
2826
}
2927
}
@@ -33,7 +31,7 @@ func (book *Notebook) Search(query ...string) []Result {
3331
}
3432

3533
for _, term := range query {
36-
extracted, ok := util.Context(paragraph, term)
34+
extracted, ok := util.ContextFold(paragraph, term)
3735
if ok {
3836
context = append(context, extracted...)
3937
matched = true
@@ -62,34 +60,3 @@ func (book *Notebook) Search(query ...string) []Result {
6260
// extractParagraphs splits content on every occurrence of two consecutive
// newlines and returns the resulting pieces in order. The pieces are not
// trimmed, and an empty input yields a single empty string.
func extractParagraphs(content string) []string {
	return strings.Split(content, "\n\n")
}
65-
66-
func matches(content string, query string) bool {
67-
tokens := strings.Fields(content)
68-
for _, token := range tokens {
69-
if len(query) > 3 && strings.HasPrefix(token, query) {
70-
return true
71-
}
72-
73-
var distance int
74-
hasUppercase := regexp.MustCompile("[A-Z]")
75-
if hasUppercase.MatchString(query) {
76-
distance = fuzzy.RankMatchNormalized(query, token)
77-
} else {
78-
distance = fuzzy.RankMatchNormalizedFold(query, token)
79-
}
80-
81-
if distance == -1 {
82-
continue
83-
}
84-
85-
if distance == 0 {
86-
return true
87-
}
88-
89-
distancePercent := (float64(distance) / float64(len(token))) * 100
90-
if distancePercent < 50 {
91-
return true
92-
}
93-
}
94-
return false
95-
}

util/context.go

+42-9
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@ import (
55
"strings"
66
)
77

8-
var mdListRegex = regexp.MustCompile(`^(\s*)(?:\*|\+|-|\d+[.)])\s+`);
9-
var mdListEntryRegex = regexp.MustCompile(`^(\s*)(?:(?:\*|\+|-|\d+[.)])\s+)?([^\n]+)`);
8+
var mdListRegex = regexp.MustCompile(`^(\s*)(?:\*|\+|-|\d+[.)])\s+`)
9+
10+
var mdListEntryRegex = regexp.MustCompile(`^(\s*)(?:(?:\*|\+|-|\d+[.)])\s+)?([^\n]+)`)
1011

1112
// cache contextRegex by input phrase
1213
var contextRegexCache = make(map[string]*regexp.Regexp)
1314

14-
func Context(s string, phrase string) ([]string, bool) {
15+
type ContextMatchFunc func(s string, phrase string) bool
16+
17+
func context(s string, phrase string, matchFunc ContextMatchFunc) ([]string, bool) {
1518
contextRegex := contextRegex(phrase)
1619
matches := contextRegex.FindAllStringSubmatch(s, -1)
1720
if matches == nil {
@@ -24,13 +27,27 @@ func Context(s string, phrase string) ([]string, bool) {
2427
match := strings.TrimSpace(match[0])
2528
if isMarkdownList(match) {
2629
for _, line := range strings.Split(match, "\n") {
27-
if strings.Contains(line, phrase) {
30+
if matchFunc(line, phrase) {
2831
context := mdListEntryRegex.FindStringSubmatch(line)
2932
contexts = append(contexts, context[2])
3033
}
3134
}
3235
continue
3336
}
37+
38+
if isMarkdownTable(match) {
39+
for _, row := range strings.Split(match, "\n") {
40+
if matchFunc(row, phrase) {
41+
for _, cell := range strings.Split(row, "|") {
42+
if matchFunc(cell, phrase) {
43+
contexts = append(contexts, strings.TrimSpace(cell))
44+
}
45+
}
46+
}
47+
}
48+
continue
49+
}
50+
3451
contexts = append(contexts, match)
3552
}
3653

@@ -41,11 +58,27 @@ func isMarkdownList(line string) bool {
4158
return mdListRegex.MatchString(line)
4259
}
4360

61+
// isMarkdownTable reports whether line starts with a pipe character ("|").
// No whitespace is trimmed here, so callers are expected to pass text whose
// leading whitespace has already been removed.
func isMarkdownTable(line string) bool {
	return strings.HasPrefix(line, "|")
}
64+
4465
func contextRegex(phrase string) *regexp.Regexp {
45-
if contextRegexCache[phrase] == nil {
46-
input := regexp.QuoteMeta(phrase)
47-
contextRegexCache[phrase] = regexp.MustCompile(`(?i)(?:[^\n]\n?)*` + input + `(?:[^\n]\n?)*`)
48-
}
66+
if contextRegexCache[phrase] == nil {
67+
input := regexp.QuoteMeta(phrase)
68+
contextRegexCache[phrase] = regexp.MustCompile(`(?i)(?:[^\n]\n?)*` + input + `(?:[^\n]\n?)*`)
69+
}
70+
71+
return contextRegexCache[phrase]
72+
}
73+
74+
func Context(s string, phrase string) ([]string, bool) {
75+
return context(s, phrase, func(s string, t string) bool {
76+
return strings.Contains(s, phrase)
77+
})
78+
}
4979

50-
return contextRegexCache[phrase]
80+
func ContextFold(s string, phrase string) ([]string, bool) {
81+
return context(s, phrase, func(s string, t string) bool {
82+
return strings.Contains(strings.ToLower(s), strings.ToLower(phrase))
83+
})
5184
}

util/context_test.go

+29
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ func TestContext(t *testing.T) {
2929

3030
{" * This is a list entry\n* list about nothing\nor maybe something", "list", []string{"This is a list entry", "list about nothing"}},
3131
{"Example 1\n\nanother example\n\nand another", "another", []string{"another example", "and another"}},
32+
33+
{"|Column A|Column B|\n|------|------|\n|Value foo|Value bar|", "foo", []string{"Value foo"}},
34+
{"| Column A | Column B |\n| ------ | ------ |\n| Value foo | Value bar |", "foo", []string{"Value foo"}},
3235
}
3336

3437
for _, test := range tests {
@@ -41,6 +44,32 @@ func TestContext(t *testing.T) {
4144
t.Fatalf("expected '%s' but was '%s'", test.want, got)
4245
}
4346
})
47+
}
48+
}
49+
50+
func TestContextFold(t *testing.T) {
51+
52+
tests := []struct {
53+
source string
54+
phrase string
55+
want []string
56+
}{
57+
{"|Column A|Column B|\n|------|------|\n|Value foo|Value bar|", "foo", []string{"Value foo"}},
58+
{"| Column A | Column B |\n| ------ | ------ |\n| Value foo | Value bar |", "foo", []string{"Value foo"}},
4459

60+
{"|Column A|Column B|\n|------|------|\n|Value Foo|Value Bar|", "foo", []string{"Value Foo"}},
61+
{"| Column A | Column B |\n| ------ | ------ |\n| Value Foo | Value Bar |", "foo", []string{"Value Foo"}},
62+
}
63+
64+
for _, test := range tests {
65+
t.Run(fmt.Sprintf("ContextFold('%v', '%v')", test.source, test.phrase), func(t *testing.T) {
66+
got, ok := ContextFold(test.source, test.phrase)
67+
if !ok {
68+
t.Fatalf("Expected ok but was not ok")
69+
}
70+
if !reflect.DeepEqual(got, test.want) {
71+
t.Fatalf("expected '%s' but was '%s'", test.want, got)
72+
}
73+
})
4574
}
4675
}

util/paragraph.go

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package util
2+
3+
import (
4+
"github.com/lithammer/fuzzysearch/fuzzy"
5+
"regexp"
6+
"strings"
7+
)
8+
9+
var noiseRegex = regexp.MustCompile(`[^\s\w#.]`)
10+
11+
func ParagraphMatches(content string, query string) bool {
12+
content = noiseRegex.ReplaceAllString(content, " ")
13+
tokens := strings.Fields(content)
14+
for _, token := range tokens {
15+
if len(query) > 3 && strings.HasPrefix(token, query) {
16+
return true
17+
}
18+
19+
var distance int
20+
hasUppercase := regexp.MustCompile("[A-Z]")
21+
if hasUppercase.MatchString(query) {
22+
distance = fuzzy.RankMatchNormalized(query, token)
23+
} else {
24+
distance = fuzzy.RankMatchNormalizedFold(query, token)
25+
}
26+
27+
if distance == -1 {
28+
continue
29+
}
30+
31+
if distance == 0 {
32+
return true
33+
}
34+
35+
distancePercent := (float64(distance) / float64(len(token))) * 100
36+
thresholdPercent := 50.0
37+
if distancePercent < thresholdPercent || (distancePercent == thresholdPercent && len(token) == 2) {
38+
return true
39+
}
40+
}
41+
return false
42+
}

util/paragraph_test.go

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package util
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
)
7+
8+
func TestParagraphMatches(t *testing.T) {
9+
tests := []struct {
10+
query string
11+
content string
12+
want bool
13+
}{
14+
{"string", "SomeString", true},
15+
{"string", "Some String", true},
16+
{"c", "C#", true},
17+
18+
{"Foo", "|Column A|Column B|\n|------|------|\n|Value Foo|Value Bar|", true},
19+
{"Foo", "| Column A | Column B |\n| ------ | ------ |\n| Value Foo | Value Bar |", true},
20+
21+
{"foo", "|Column A|Column B|\n|------|------|\n|Value Foo|Value Bar|", true},
22+
{"foo", "| Column A | Column B |\n| ------ | ------ |\n| Value Foo | Value Bar |", true},
23+
24+
{"booom", "| Thing | Booom |\n", true},
25+
}
26+
27+
for _, test := range tests {
28+
t.Run(fmt.Sprintf("ParagraphMatches('%v', '%v')", test.content, test.query), func(t *testing.T) {
29+
got := ParagraphMatches(test.content, test.query)
30+
if got != test.want {
31+
t.Fatalf("expected %t but was %t", test.want, got)
32+
}
33+
})
34+
}
35+
}

0 commit comments

Comments
 (0)