Skip to content

Reduce lexer memory allocs #811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 30 additions & 25 deletions file/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package file
import (
"fmt"
"strings"
"unicode/utf8"
)

type Error struct {
Expand All @@ -19,43 +18,49 @@ func (e *Error) Error() string {
return e.format()
}

var tabReplacer = strings.NewReplacer("\t", " ")

func (e *Error) Bind(source Source) *Error {
src := source.String()

e.Line = 1
for i, r := range source {
if i == e.From {
var runeCount, lineStart, lineOffset int
for i, r := range src {
if runeCount == e.From {
break
}
if r == '\n' {
lineStart = i
e.Line++
e.Column = 0
lineOffset = 0
} else {
e.Column++
}
runeCount++
lineOffset++
}

lineEnd := lineStart + strings.IndexByte(src[lineStart:], '\n')
if lineEnd < lineStart {
lineEnd = len(src)
}
if lineStart == lineEnd {
return e
}
if snippet, found := source.Snippet(e.Line); found {
snippet := strings.Replace(snippet, "\t", " ", -1)
srcLine := "\n | " + snippet
var bytes = []byte(snippet)
var indLine = "\n | "
for i := 0; i < e.Column && len(bytes) > 0; i++ {
_, sz := utf8.DecodeRune(bytes)
bytes = bytes[sz:]
if sz > 1 {
goto noind
} else {
indLine += "."
}
}
if _, sz := utf8.DecodeRune(bytes); sz > 1 {
goto noind
} else {
indLine += "^"
}
srcLine += indLine

noind:
e.Snippet = srcLine
const prefix = "\n | "
line := src[lineStart:lineEnd]
snippet := new(strings.Builder)
snippet.Grow(2*len(prefix) + len(line) + lineOffset + 1)
snippet.WriteString(prefix)
tabReplacer.WriteString(snippet, line)
snippet.WriteString(prefix)
for i := 0; i < lineOffset; i++ {
snippet.WriteByte('.')
}
snippet.WriteByte('^')
e.Snippet = snippet.String()
return e
}

Expand Down
52 changes: 20 additions & 32 deletions file/source.go
Original file line number Diff line number Diff line change
@@ -1,48 +1,36 @@
package file

import (
"strings"
"unicode/utf8"
)
import "strings"

type Source []rune
type Source struct {
raw string
}

func NewSource(contents string) Source {
return []rune(contents)
return Source{
raw: contents,
}
}

func (s Source) String() string {
return string(s)
return s.raw
}

func (s Source) Snippet(line int) (string, bool) {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is no longer used internally, but I'm keeping it in case someone is still using it.

if s == nil {
if s.raw == "" {
return "", false
}
lines := strings.Split(string(s), "\n")
lineOffsets := make([]int, len(lines))
var offset int
for i, line := range lines {
offset = offset + utf8.RuneCountInString(line) + 1
lineOffsets[i] = offset
}
charStart, found := getLineOffset(lineOffsets, line)
if !found || len(s) == 0 {
return "", false
var start int
for i := 1; i < line; i++ {
pos := strings.IndexByte(s.raw[start:], '\n')
if pos < 0 {
return "", false
}
start += pos + 1
}
charEnd, found := getLineOffset(lineOffsets, line+1)
if found {
return string(s[charStart : charEnd-1]), true
}
return string(s[charStart:]), true
}

func getLineOffset(lineOffsets []int, line int) (int, bool) {
if line == 1 {
return 0, true
} else if line > 1 && line <= len(lineOffsets) {
offset := lineOffsets[line-2]
return offset, true
end := start + strings.IndexByte(s.raw[start:], '\n')
if end < start {
end = len(s.raw)
}
return -1, false
return s.raw[start:end], true
}
23 changes: 23 additions & 0 deletions parser/lexer/bench_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package lexer

import (
"testing"

"github.com/expr-lang/expr/file"
)

func BenchmarkParser(b *testing.B) {
const source = `
/*
Showing worst case scenario
*/
let value = trim("contains escapes \n\"\\ \U0001F600 and non ASCII ñ"); // inline comment
len(value) == 0x2A
// let's introduce an error too
whatever
`
b.ReportAllocs()
for i := 0; i < b.N; i++ {
Lex(file.NewSource(source))
}
}
105 changes: 52 additions & 53 deletions parser/lexer/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@ package lexer
import (
"fmt"
"strings"
"unicode/utf8"

"github.com/expr-lang/expr/file"
)

const minTokens = 10
Copy link
Author

@diegommm diegommm Jun 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This allows for optimistically allocating tokens. Discussing the "right" value for this parameter can be a loss of time (everyone has an opinion on what should be the "right" value).
The only thing I will insist on is that zero (the default) is the worst possible value, because you will almost always need at least a few tokens — it would be very rare that you don't. With a zero capacity, the first few times you add a token with append, the runtime has to repeatedly copy the underlying array as it grows the slice.


func Lex(source file.Source) ([]Token, error) {
raw := source.String()
l := &lexer{
source: source,
tokens: make([]Token, 0),
start: 0,
end: 0,
raw: raw,
tokens: make([]Token, 0, minTokens),
}
l.commit()

for state := root; state != nil; {
state = state(l)
Expand All @@ -28,10 +29,15 @@ func Lex(source file.Source) ([]Token, error) {
}

type lexer struct {
source file.Source
raw string
tokens []Token
start, end int
err *file.Error
start, end pos
eof bool
}

type pos struct {
byte, rune int
}

const eof rune = -1
Expand All @@ -41,23 +47,39 @@ func (l *lexer) commit() {
}

func (l *lexer) next() rune {
if l.end >= len(l.source) {
l.end++
if l.end.byte >= len(l.raw) {
l.eof = true
return eof
}
r := l.source[l.end]
l.end++
r, sz := utf8.DecodeRuneInString(l.raw[l.end.byte:])
l.end.rune++
l.end.byte += sz
return r
}

func (l *lexer) peek() rune {
r := l.next()
l.backup()
return r
if l.end.byte < len(l.raw) {
r, _ := utf8.DecodeRuneInString(l.raw[l.end.byte:])
return r
}
return eof
}

func (l *lexer) peekByte() (byte, bool) {
if l.end.byte >= 0 && l.end.byte < len(l.raw) {
return l.raw[l.end.byte], true
}
return 0, false
}

func (l *lexer) backup() {
l.end--
if l.eof {
l.eof = false
} else if l.end.rune > 0 {
_, sz := utf8.DecodeLastRuneInString(l.raw[:l.end.byte])
l.end.byte -= sz
l.end.rune--
}
}

func (l *lexer) emit(t Kind) {
Expand All @@ -66,19 +88,19 @@ func (l *lexer) emit(t Kind) {

func (l *lexer) emitValue(t Kind, value string) {
l.tokens = append(l.tokens, Token{
Location: file.Location{From: l.start, To: l.end},
Location: file.Location{From: l.start.rune, To: l.end.rune},
Kind: t,
Value: value,
})
l.commit()
}

func (l *lexer) emitEOF() {
from := l.end - 2
from := l.end.rune - 1
if from < 0 {
from = 0
}
to := l.end - 1
to := l.end.rune - 0
if to < 0 {
to = 0
}
Expand All @@ -94,60 +116,37 @@ func (l *lexer) skip() {
}

func (l *lexer) word() string {
// TODO: boundary check is NOT needed here, but for some reason CI fuzz tests are failing.
if l.start > len(l.source) || l.end > len(l.source) {
return "__invalid__"
}
return string(l.source[l.start:l.end])
return l.raw[l.start.byte:l.end.byte]
}

func (l *lexer) accept(valid string) bool {
if strings.ContainsRune(valid, l.next()) {
if strings.ContainsRune(valid, l.peek()) {
l.next()
return true
}
l.backup()
return false
}

func (l *lexer) acceptRun(valid string) {
for strings.ContainsRune(valid, l.next()) {
for l.accept(valid) {
}
l.backup()
}

func (l *lexer) skipSpaces() {
r := l.peek()
for ; r == ' '; r = l.peek() {
l.next()
}
l.acceptRun(" ")
l.skip()
}

func (l *lexer) acceptWord(word string) bool {
pos := l.end

l.skipSpaces()

for _, ch := range word {
if l.next() != ch {
l.end = pos
return false
}
}
if r := l.peek(); r != ' ' && r != eof {
l.end = pos
return false
}

return true
}

func (l *lexer) error(format string, args ...any) stateFn {
if l.err == nil { // show first error
end := l.end.rune
if l.eof {
end++
}
l.err = &file.Error{
Location: file.Location{
From: l.end - 1,
To: l.end,
From: end - 1,
To: end,
},
Message: fmt.Sprintf(format, args...),
}
Expand Down Expand Up @@ -225,6 +224,6 @@ func (l *lexer) scanRawString(quote rune) (n int) {
ch = l.next()
n++
}
l.emitValue(String, string(l.source[l.start+1:l.end-1]))
l.emitValue(String, l.raw[l.start.byte+1:l.end.byte-1])
return
}
1 change: 1 addition & 0 deletions parser/lexer/lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ literal not terminated (1:10)
früh ♥︎
unrecognized character: U+2665 '♥' (1:6)
| früh ♥︎
| .....^
Copy link
Author

@diegommm diegommm Jun 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to be a small bug: in the original code, if it encountered a rune longer than one byte it would skip the indicator line entirely. I imagine that wouldn't be good if we had something like:

let myStr = "Hello, 世界"; someError

In that case it wouldn't emit the indicator line at all, because it would encounter the '世' rune.

`

func TestLex_error(t *testing.T) {
Expand Down
12 changes: 4 additions & 8 deletions parser/lexer/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,13 @@ func (t Token) String() string {
}

func (t Token) Is(kind Kind, values ...string) bool {
if len(values) == 0 {
return kind == t.Kind
if kind != t.Kind {
return false
}

for _, v := range values {
if v == t.Value {
goto found
return true
}
}
return false

found:
return kind == t.Kind
return len(values) == 0
}
Loading