Added Position field to the Tokenizer (#128)
The Position gives a means to index into the Tokenizer's underlying byte slice.
This enables use cases where the caller plans to edit the JSON document and wants
to leverage the copy func to optimize data movement, or to copy the remaining
bytes when exiting the tokenizing loop early.
Steve van Loben Sels authored Nov 2, 2022
1 parent 3391c4a commit b2d0aeb
Showing 2 changed files with 38 additions and 13 deletions.
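
To illustrate the early-exit use case the commit message describes, here is a minimal sketch (not part of the commit). It assumes the package lives at github.com/segmentio/encoding/json; the import path is not shown in this diff, and only NewTokenizer, Next, and the new Position field are relied on.

package main

import (
	"fmt"

	"github.com/segmentio/encoding/json" // assumed import path, not shown in this commit
)

func main() {
	src := []byte(`{"name":"alice","age":30,"tags":["a","b"]}`)

	tok := json.NewTokenizer(src)
	for n := 0; tok.Next(); n++ {
		if n == 3 {
			break // exit the tokenizing loop early
		}
	}

	// Position is the index just past the last token consumed, so
	// src[tok.Position:] holds the bytes the loop never looked at.
	rest := make([]byte, len(src)-tok.Position)
	copy(rest, src[tok.Position:])
	fmt.Printf("stopped at byte %d, remaining bytes: %s\n", tok.Position, rest)
}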
18 changes: 17 additions & 1 deletion json/token.go
@@ -31,7 +31,6 @@ import (
// ...
// }
// }
//
type Tokenizer struct {
// When the tokenizer is positioned on a json delimiter this field is not
// zero. In this case the possible values are '{', '}', '[', ']', ':', and
@@ -44,6 +43,17 @@ type Tokenizer struct {
// null, true, false, numbers, or quoted strings.
Value RawValue

// Position is the Tokenizer's current index into the underlying byte slice.
// Since the Tokenizer has already been advanced by calling Next, this
// position will be the first index of the next token. The position of
// the current Value can be calculated by subtracting len(token.Value).
// Accordingly, slicing the underlying bytes like:
//
// b[token.Position-len(token.Value):token.Position]
//
// will yield the current Value.
Position int

// When the tokenizer has encountered invalid content this field is not nil.
Err error

@@ -92,6 +102,7 @@ func (t *Tokenizer) Reset(b []byte) {
// However, it does not compile down to an invocation of duff-copy.
t.Delim = 0
t.Value = nil
t.Position = 0
t.Err = nil
t.Depth = 0
t.Index = 0
@@ -128,13 +139,16 @@ skipLoop:

if i > 0 {
t.json = t.json[i:]
t.Position += i
}

if len(t.json) == 0 {
t.Reset(nil)
return false
}

lenBefore := len(t.json)

var kind Kind
switch t.json[0] {
case '"':
@@ -165,6 +179,8 @@
t.Value, t.json, t.Err = t.json[:1], t.json[1:], syntaxError(t.json, "expected token but found '%c'", t.json[0])
}

t.Position += lenBefore - len(t.json)

t.Depth = t.depth()
t.Index = t.index()
t.flags = t.flags.withKind(kind)
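A small fragment spelling out the slicing relationship the new doc comment describes (same assumed import path as the sketch above); the updated tokenize helper in json/token_test.go below performs the same check for every token.

b := []byte(`{"answer":42}`)
tok := json.NewTokenizer(b)
for tok.Next() {
	start, end := tok.Position-len(tok.Value), tok.Position
	fmt.Printf("b[%d:%d] = %q, Value = %q\n", start, end, b[start:end], tok.Value)
}
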
33 changes: 21 additions & 12 deletions json/token_test.go
@@ -1,6 +1,7 @@
package json

import (
"bytes"
"reflect"
"testing"
)
@@ -40,22 +41,30 @@ func value(v string, depth, index int) token {
}
}

func tokenize(b []byte) (tokens []token) {
t := NewTokenizer(b)
func tokenize(t *testing.T, b []byte) (tokens []token) {
tok := NewTokenizer(b)

for tok.Next() {
start, end := tok.Position-len(tok.Value), tok.Position
if end > len(b) {
t.Fatalf("token position too far [%d:%d], len(b) is %d", start, end, len(b))
}
if !bytes.Equal(b[start:end], tok.Value) {
t.Fatalf("token position is wrong [%d:%d]", start, end)
}

for t.Next() {
tokens = append(tokens, token{
delim: t.Delim,
value: t.Value,
err: t.Err,
depth: t.Depth,
index: t.Index,
isKey: t.IsKey,
delim: tok.Delim,
value: tok.Value,
err: tok.Err,
depth: tok.Depth,
index: tok.Index,
isKey: tok.IsKey,
})
}

if t.Err != nil {
panic(t.Err)
if tok.Err != nil {
t.Fatal(tok.Err)
}

return
@@ -174,7 +183,7 @@ func TestTokenizer(t *testing.T) {

for _, test := range tests {
t.Run(string(test.input), func(t *testing.T) {
tokens := tokenize(test.input)
tokens := tokenize(t, test.input)

if !reflect.DeepEqual(tokens, test.tokens) {
t.Error("tokens mismatch")
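The other use case the commit message mentions, editing the document while moving unchanged bytes with copy, could look roughly like the fragment below. This is a hedged sketch, not code from the commit; it relies only on the NewTokenizer, Next, Value, and Position identifiers shown in the diff.

src := []byte(`{"count":1,"ok":true}`)
tok := json.NewTokenizer(src)

for tok.Next() {
	if string(tok.Value) == "1" { // the number value we want to rewrite
		start, end := tok.Position-len(tok.Value), tok.Position
		replacement := []byte("2")

		// Splice in the replacement, bulk-copying the untouched bytes around it.
		edited := make([]byte, len(src)-len(tok.Value)+len(replacement))
		n := copy(edited, src[:start])
		n += copy(edited[n:], replacement)
		copy(edited[n:], src[end:])

		fmt.Printf("%s\n", edited) // {"count":2,"ok":true}
		break
	}
}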
