From b2d0aeb3c78510d11e106e8e9a8a59e38fa96d55 Mon Sep 17 00:00:00 2001
From: Steve van Loben Sels
Date: Wed, 2 Nov 2022 09:38:54 -0700
Subject: [PATCH] Added Position field to the Tokenizer (#128)

The Position field gives a means to index into the Tokenizer's
underlying byte slice. This enables use cases where the caller plans to
make edits to the JSON document but wants to leverage the copy func to
optimize data movement, and/or to copy the remaining bytes if it exits
the tokenizing loop early.
---
 json/token.go      | 18 +++++++++++++++++-
 json/token_test.go | 33 +++++++++++++++++++++------------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/json/token.go b/json/token.go
index b9f46ae..b82f49b 100644
--- a/json/token.go
+++ b/json/token.go
@@ -31,7 +31,6 @@ import (
 //			...
 //		}
 //	}
-//
 type Tokenizer struct {
 	// When the tokenizer is positioned on a json delimiter this field is not
 	// zero. In this case the possible values are '{', '}', '[', ']', ':', and
@@ -44,6 +43,17 @@ type Tokenizer struct {
 	// null, true, false, numbers, or quoted strings.
 	Value RawValue
 
+	// Position is the Tokenizer's current index into the underlying byte slice.
+	// Since the Tokenizer has already been advanced by calling Next, this
+	// position will be the first index of the next token. The position of
+	// the current Value can be calculated by subtracting len(token.Value).
+	// Accordingly, slicing the underlying bytes like:
+	//
+	//	b[token.Position-len(token.Value):token.Position]
+	//
+	// will yield the current Value.
+	Position int
+
 	// When the tokenizer has encountered invalid content this field is not nil.
 	Err error
 
@@ -92,6 +102,7 @@ func (t *Tokenizer) Reset(b []byte) {
 	// However, it does not compile down to an invocation of duff-copy.
 	t.Delim = 0
 	t.Value = nil
+	t.Position = 0
 	t.Err = nil
 	t.Depth = 0
 	t.Index = 0
@@ -128,6 +139,7 @@ skipLoop:
 
 	if i > 0 {
 		t.json = t.json[i:]
+		t.Position += i
 	}
 
 	if len(t.json) == 0 {
@@ -135,6 +147,8 @@ skipLoop:
 		return false
 	}
 
+	lenBefore := len(t.json)
+
 	var kind Kind
 	switch t.json[0] {
 	case '"':
@@ -165,6 +179,8 @@
 		t.Value, t.json, t.Err = t.json[:1], t.json[1:], syntaxError(t.json, "expected token but found '%c'", t.json[0])
 	}
 
+	t.Position += lenBefore - len(t.json)
+
 	t.Depth = t.depth()
 	t.Index = t.index()
 	t.flags = t.flags.withKind(kind)
diff --git a/json/token_test.go b/json/token_test.go
index 2805de3..f5dbf65 100644
--- a/json/token_test.go
+++ b/json/token_test.go
@@ -1,6 +1,7 @@
 package json
 
 import (
+	"bytes"
 	"reflect"
 	"testing"
 )
@@ -40,22 +41,30 @@ func value(v string, depth, index int) token {
 	}
 }
 
-func tokenize(b []byte) (tokens []token) {
-	t := NewTokenizer(b)
+func tokenize(t *testing.T, b []byte) (tokens []token) {
+	tok := NewTokenizer(b)
+
+	for tok.Next() {
+		start, end := tok.Position-len(tok.Value), tok.Position
+		if end > len(b) {
+			t.Fatalf("token position too far [%d:%d], len(b) is %d", start, end, len(b))
+		}
+		if !bytes.Equal(b[start:end], tok.Value) {
+			t.Fatalf("token position is wrong [%d:%d]", start, end)
+		}
 
-	for t.Next() {
 		tokens = append(tokens, token{
-			delim: t.Delim,
-			value: t.Value,
-			err:   t.Err,
-			depth: t.Depth,
-			index: t.Index,
-			isKey: t.IsKey,
+			delim: tok.Delim,
+			value: tok.Value,
+			err:   tok.Err,
+			depth: tok.Depth,
+			index: tok.Index,
+			isKey: tok.IsKey,
 		})
 	}
 
-	if t.Err != nil {
-		panic(t.Err)
+	if tok.Err != nil {
+		t.Fatal(tok.Err)
 	}
 
 	return
@@ -174,7 +183,7 @@ func TestTokenizer(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(string(test.input), func(t *testing.T) {
-			tokens := tokenize(test.input)
+			tokens := tokenize(t, test.input)
 			if !reflect.DeepEqual(tokens, test.tokens) {
 				t.Error("tokens mismatch")
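
A minimal sketch of the editing use case the commit message describes. This is an illustration, not part of the patch: it assumes the diff belongs to the github.com/segmentio/encoding json package (matching the json/token.go path above), and the redactPasswords helper and the "password" key are hypothetical. Position points just past the current token, so b[prev:start] is the untouched span between the last edit and the token being replaced, and the final append of b[prev:] also covers exiting the loop early.

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/segmentio/encoding/json" // assumed import path for this repo
)

// redactPasswords replaces every scalar value that follows a "password" key,
// using Tokenizer.Position to copy the unchanged spans of the input in bulk.
func redactPasswords(b []byte) []byte {
	out := make([]byte, 0, len(b))
	prev := 0 // end of the last span already copied into out

	tok := json.NewTokenizer(b)
	redactNext := false

	for tok.Next() {
		// Position points just past the current token, so the token
		// itself occupies b[start:tok.Position].
		start := tok.Position - len(tok.Value)

		switch {
		case tok.IsKey && bytes.Equal(tok.Value, []byte(`"password"`)):
			redactNext = true
		case tok.Delim == ':':
			// The ':' separating key and value is its own token; keep
			// the pending redaction alive across it.
		case redactNext && tok.Delim == 0:
			// Delim == 0 restricts the edit to scalar values; a key
			// holding an object or array is left alone in this sketch.
			out = append(out, b[prev:start]...) // unchanged bytes before the value
			out = append(out, `"***"`...)       // spliced-in replacement
			prev = tok.Position                 // resume copying after the value
			redactNext = false
		default:
			redactNext = false
		}
	}

	// Copy whatever remains after the last edit; the same bookkeeping
	// works if the caller breaks out of the loop early, since b[prev:]
	// is still the unconsumed tail.
	return append(out, b[prev:]...)
}

func main() {
	doc := []byte(`{"user":"jane","password":"hunter2","ok":true}`)
	fmt.Println(string(redactPasswords(doc)))
	// {"user":"jane","password":"***","ok":true}
}
```

Because whole untouched spans are moved with single appends, the rewrite avoids re-encoding the parts of the document that don't change, which is the data-movement optimization the commit message refers to.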