-
-
Notifications
You must be signed in to change notification settings - Fork 445
Reduce lexer memory allocs #811
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,36 @@ | ||
package file | ||
|
||
import ( | ||
"strings" | ||
"unicode/utf8" | ||
) | ||
import "strings" | ||
|
||
type Source []rune | ||
type Source struct { | ||
raw string | ||
} | ||
|
||
func NewSource(contents string) Source { | ||
return []rune(contents) | ||
return Source{ | ||
raw: contents, | ||
} | ||
} | ||
|
||
func (s Source) String() string { | ||
return string(s) | ||
return s.raw | ||
} | ||
|
||
func (s Source) Snippet(line int) (string, bool) { | ||
if s == nil { | ||
if s.raw == "" { | ||
return "", false | ||
} | ||
lines := strings.Split(string(s), "\n") | ||
lineOffsets := make([]int, len(lines)) | ||
var offset int | ||
for i, line := range lines { | ||
offset = offset + utf8.RuneCountInString(line) + 1 | ||
lineOffsets[i] = offset | ||
} | ||
charStart, found := getLineOffset(lineOffsets, line) | ||
if !found || len(s) == 0 { | ||
return "", false | ||
var start int | ||
for i := 1; i < line; i++ { | ||
pos := strings.IndexByte(s.raw[start:], '\n') | ||
if pos < 0 { | ||
return "", false | ||
} | ||
start += pos + 1 | ||
} | ||
charEnd, found := getLineOffset(lineOffsets, line+1) | ||
if found { | ||
return string(s[charStart : charEnd-1]), true | ||
} | ||
return string(s[charStart:]), true | ||
} | ||
|
||
func getLineOffset(lineOffsets []int, line int) (int, bool) { | ||
if line == 1 { | ||
return 0, true | ||
} else if line > 1 && line <= len(lineOffsets) { | ||
offset := lineOffsets[line-2] | ||
return offset, true | ||
end := start + strings.IndexByte(s.raw[start:], '\n') | ||
if end < start { | ||
end = len(s.raw) | ||
} | ||
return -1, false | ||
return s.raw[start:end], true | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package lexer | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/expr-lang/expr/file" | ||
) | ||
|
||
func BenchmarkParser(b *testing.B) { | ||
const source = ` | ||
/* | ||
Showing worst case scenario | ||
*/ | ||
let value = trim("contains escapes \n\"\\ \U0001F600 and non ASCII ñ"); // inline comment | ||
len(value) == 0x2A | ||
// let's introduce an error too | ||
whatever | ||
` | ||
b.ReportAllocs() | ||
for i := 0; i < b.N; i++ { | ||
Lex(file.NewSource(source)) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,18 +3,19 @@ package lexer | |
import ( | ||
"fmt" | ||
"strings" | ||
"unicode/utf8" | ||
|
||
"github.com/expr-lang/expr/file" | ||
) | ||
|
||
const minTokens = 10 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This allows tokens to be allocated optimistically. Discussing the "right" value for this parameter can be a waste of time (everyone has an opinion on what the "right" value should be). |
||
|
||
func Lex(source file.Source) ([]Token, error) { | ||
raw := source.String() | ||
l := &lexer{ | ||
source: source, | ||
tokens: make([]Token, 0), | ||
start: 0, | ||
end: 0, | ||
raw: raw, | ||
tokens: make([]Token, 0, minTokens), | ||
} | ||
l.commit() | ||
|
||
for state := root; state != nil; { | ||
state = state(l) | ||
|
@@ -28,10 +29,15 @@ func Lex(source file.Source) ([]Token, error) { | |
} | ||
|
||
type lexer struct { | ||
source file.Source | ||
raw string | ||
tokens []Token | ||
start, end int | ||
err *file.Error | ||
start, end pos | ||
eof bool | ||
} | ||
|
||
type pos struct { | ||
byte, rune int | ||
} | ||
|
||
const eof rune = -1 | ||
|
@@ -41,23 +47,39 @@ func (l *lexer) commit() { | |
} | ||
|
||
func (l *lexer) next() rune { | ||
if l.end >= len(l.source) { | ||
l.end++ | ||
if l.end.byte >= len(l.raw) { | ||
l.eof = true | ||
return eof | ||
} | ||
r := l.source[l.end] | ||
l.end++ | ||
r, sz := utf8.DecodeRuneInString(l.raw[l.end.byte:]) | ||
l.end.rune++ | ||
l.end.byte += sz | ||
return r | ||
} | ||
|
||
func (l *lexer) peek() rune { | ||
r := l.next() | ||
l.backup() | ||
return r | ||
if l.end.byte < len(l.raw) { | ||
r, _ := utf8.DecodeRuneInString(l.raw[l.end.byte:]) | ||
return r | ||
} | ||
return eof | ||
} | ||
|
||
func (l *lexer) peekByte() (byte, bool) { | ||
if l.end.byte >= 0 && l.end.byte < len(l.raw) { | ||
return l.raw[l.end.byte], true | ||
} | ||
return 0, false | ||
} | ||
|
||
func (l *lexer) backup() { | ||
l.end-- | ||
if l.eof { | ||
l.eof = false | ||
} else if l.end.rune > 0 { | ||
_, sz := utf8.DecodeLastRuneInString(l.raw[:l.end.byte]) | ||
l.end.byte -= sz | ||
l.end.rune-- | ||
} | ||
} | ||
|
||
func (l *lexer) emit(t Kind) { | ||
|
@@ -66,19 +88,19 @@ func (l *lexer) emit(t Kind) { | |
|
||
func (l *lexer) emitValue(t Kind, value string) { | ||
l.tokens = append(l.tokens, Token{ | ||
Location: file.Location{From: l.start, To: l.end}, | ||
Location: file.Location{From: l.start.rune, To: l.end.rune}, | ||
Kind: t, | ||
Value: value, | ||
}) | ||
l.commit() | ||
} | ||
|
||
func (l *lexer) emitEOF() { | ||
from := l.end - 2 | ||
from := l.end.rune - 1 | ||
if from < 0 { | ||
from = 0 | ||
} | ||
to := l.end - 1 | ||
to := l.end.rune - 0 | ||
if to < 0 { | ||
to = 0 | ||
} | ||
|
@@ -94,60 +116,37 @@ func (l *lexer) skip() { | |
} | ||
|
||
func (l *lexer) word() string { | ||
// TODO: boundary check is NOT needed here, but for some reason CI fuzz tests are failing. | ||
if l.start > len(l.source) || l.end > len(l.source) { | ||
return "__invalid__" | ||
} | ||
return string(l.source[l.start:l.end]) | ||
return l.raw[l.start.byte:l.end.byte] | ||
} | ||
|
||
func (l *lexer) accept(valid string) bool { | ||
if strings.ContainsRune(valid, l.next()) { | ||
if strings.ContainsRune(valid, l.peek()) { | ||
l.next() | ||
return true | ||
} | ||
l.backup() | ||
return false | ||
} | ||
|
||
func (l *lexer) acceptRun(valid string) { | ||
for strings.ContainsRune(valid, l.next()) { | ||
for l.accept(valid) { | ||
} | ||
l.backup() | ||
} | ||
|
||
func (l *lexer) skipSpaces() { | ||
r := l.peek() | ||
for ; r == ' '; r = l.peek() { | ||
l.next() | ||
} | ||
l.acceptRun(" ") | ||
l.skip() | ||
} | ||
|
||
func (l *lexer) acceptWord(word string) bool { | ||
pos := l.end | ||
|
||
l.skipSpaces() | ||
|
||
for _, ch := range word { | ||
if l.next() != ch { | ||
l.end = pos | ||
return false | ||
} | ||
} | ||
if r := l.peek(); r != ' ' && r != eof { | ||
l.end = pos | ||
return false | ||
} | ||
|
||
return true | ||
} | ||
|
||
func (l *lexer) error(format string, args ...any) stateFn { | ||
if l.err == nil { // show first error | ||
end := l.end.rune | ||
if l.eof { | ||
end++ | ||
} | ||
l.err = &file.Error{ | ||
Location: file.Location{ | ||
From: l.end - 1, | ||
To: l.end, | ||
From: end - 1, | ||
To: end, | ||
}, | ||
Message: fmt.Sprintf(format, args...), | ||
} | ||
|
@@ -225,6 +224,6 @@ func (l *lexer) scanRawString(quote rune) (n int) { | |
ch = l.next() | ||
n++ | ||
} | ||
l.emitValue(String, string(l.source[l.start+1:l.end-1])) | ||
l.emitValue(String, l.raw[l.start.byte+1:l.end.byte-1]) | ||
return | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -335,6 +335,7 @@ literal not terminated (1:10) | |
früh ♥︎ | ||
unrecognized character: U+2665 '♥' (1:6) | ||
| früh ♥︎ | ||
| .....^ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This appeared to be a small bug. In the original code, if it found a rune longer than 1 byte, it would skip the indicator line entirely. I imagine that wouldn't be good if we had something like:
In that case it wouldn't print the indicator line, because it would find the '世' rune. |
||
` | ||
|
||
func TestLex_error(t *testing.T) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
This method is no longer used internally, but I'm keeping it in case someone else still depends on it.