Skip to content

Commit d94d23d

Browse files
exploded and claude committed
fix(sqlite): convert ANTLR character indices to byte offsets for source extraction
ANTLR's InputStream operates on characters (runes), so token positions returned by GetStop() are character indices. However, source.Pluck() slices Go strings using byte offsets. When multi-byte UTF-8 characters (e.g. em-dash U+2014) appear in SQL comments, this mismatch causes queries to be extracted at wrong positions -- truncating parameter placeholders and leaking comment text into generated Go code.

Build a rune-to-byte offset lookup table and use it to translate ANTLR positions before storing StmtLocation and StmtLen.

Fixes #4372

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4bf2159 commit d94d23d

File tree

7 files changed

+129
-4
lines changed

7 files changed

+129
-4
lines changed

internal/endtoend/testdata/sqlite_unicode_comment/db/db.go

Lines changed: 31 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/endtoend/testdata/sqlite_unicode_comment/db/models.go

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/endtoend/testdata/sqlite_unicode_comment/db/query.sql.go

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- name: GetItem :one
2+
SELECT id, name FROM items WHERE id = ?;
3+
4+
-- section — divider
5+
6+
-- name: UpdateItem :exec
7+
UPDATE items SET name = ? WHERE id = ?;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CREATE TABLE items (id INTEGER PRIMARY KEY, name TEXT NOT NULL);
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"version": "2",
3+
"sql": [
4+
{
5+
"engine": "sqlite",
6+
"queries": "query.sql",
7+
"schema": "schema.sql",
8+
"gen": {
9+
"go": {
10+
"package": "db",
11+
"out": "db"
12+
}
13+
}
14+
}
15+
]
16+
}

internal/engine/sqlite/parse.go

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"fmt"
66
"io"
7+
"unicode/utf8"
78

89
"github.com/antlr4-go/antlr/v4"
910
"github.com/sqlc-dev/sqlc/internal/engine/sqlite/parser"
@@ -42,7 +43,8 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
4243
if err != nil {
4344
return nil, err
4445
}
45-
input := antlr.NewInputStream(string(blob))
46+
src := string(blob)
47+
input := antlr.NewInputStream(src)
4648
lexer := parser.NewSQLiteLexer(input)
4749
stream := antlr.NewCommonTokenStream(lexer, 0)
4850
pp := parser.NewSQLiteParser(stream)
@@ -57,6 +59,13 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
5759
if !ok {
5860
return nil, fmt.Errorf("expected ParserContext; got %T\n", tree)
5961
}
62+
63+
// ANTLR's InputStream operates on characters (runes), so token
64+
// positions are character indices. source.Pluck slices with byte
65+
// offsets. Build a lookup table so we can translate correctly when
66+
// the input contains multi-byte UTF-8 characters (e.g. em-dash).
67+
runeToByteOffset := buildRuneToByteOffsets(src)
68+
6069
var stmts []ast.Statement
6170
for _, istmt := range pctx.AllSql_stmt_list() {
6271
list, ok := istmt.(*parser.Sql_stmt_listContext)
@@ -72,12 +81,13 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
7281
loc = stmt.GetStop().GetStop() + 2
7382
continue
7483
}
75-
len := (stmt.GetStop().GetStop() + 1) - loc
84+
byteLoc := runeToByteOffset[loc]
85+
byteEnd := runeToByteOffset[stmt.GetStop().GetStop()+1]
7686
stmts = append(stmts, ast.Statement{
7787
Raw: &ast.RawStmt{
7888
Stmt: out,
79-
StmtLocation: loc,
80-
StmtLen: len,
89+
StmtLocation: byteLoc,
90+
StmtLen: byteEnd - byteLoc,
8191
},
8292
})
8393
loc = stmt.GetStop().GetStop() + 2
@@ -86,6 +96,19 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
8696
return stmts, nil
8797
}
8898

99+
// buildRuneToByteOffsets builds a lookup table that converts a rune
// (character) index in s into the byte offset at which that rune starts.
// The table has one entry per rune plus a trailing entry equal to len(s),
// so an exclusive end index (one past the last rune) can be resolved
// without indexing out of range.
func buildRuneToByteOffsets(s string) []int {
	table := make([]int, utf8.RuneCountInString(s)+1)
	idx := 0
	// Ranging over a string yields the starting byte position of each rune.
	for start := range s {
		table[idx] = start
		idx++
	}
	// Sentinel entry: the byte position just past the final rune.
	table[idx] = len(s)
	return table
}
111+
89112
func (p *Parser) CommentSyntax() source.CommentSyntax {
90113
return source.CommentSyntax{
91114
Dash: true,

0 commit comments

Comments
 (0)