Skip to content

Commit 624fae0

Browse files
authored
Merge pull request #101 from ipld/codectools-tokenizers
Fresh take on codec APIs, and some tokenization utilities.
2 parents 35ad3e3 + 1110155 commit 624fae0

File tree

8 files changed

+873
-0
lines changed

8 files changed

+873
-0
lines changed

codec/api.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package codec
2+
3+
import (
4+
"io"
5+
6+
"github.com/ipld/go-ipld-prime"
7+
)
8+
9+
// Encoder is the essential definition of a function that takes IPLD Data Model data in memory and serializes it.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Decoder).
//
// Encoder functions can be composed into an ipld.LinkSystem to provide
// a "one stop shop" API for handling content addressable storage.
// Encoder functions can also be used directly if you want to handle serial data streams.
//
// Most codec packages will have a ReusableEncoder type
// (which contains any working memory needed by the encoder implementation,
// as well as any configuration options),
// and that type will have an Encode function matching this interface.
//
// By convention, codec packages that have a multicodec contract will also have
// a package-scope exported function called Encode which also matches this interface,
// and is the equivalent of creating a zero-value ReusableEncoder (aka, default config)
// and using its Encode method.
// This package-scope function will typically also internally use a sync.Pool
// to keep some ReusableEncoder values on hand to avoid unnecessary allocations.
//
// Note that a ReusableEncoder type that supports configuration options
// does not functionally expose those options when invoked by the multicodec system --
// multicodec indicators do not provide room for extended configuration info.
// Codecs that expose configuration options are doing so for library users to enjoy;
// it does not mean those non-default configurations will necessarily be available
// in all scenarios that use codecs indirectly.
// There is also no standard interface for such configurations: by nature,
// if they exist at all, they vary per codec.
type Encoder func(data ipld.Node, output io.Writer) error
37+
38+
// Decoder is the essential definition of a function that consumes serial data and unfurls it into IPLD Data Model-compatible in-memory representations.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Encoder).
//
// Decoder is the dual of Encoder.
// Most of the documentation for the Encoder function interface
// also applies wholesale to the Decoder interface.
type Decoder func(into ipld.NodeAssembler, input io.Reader) error
45+
46+
// ErrBudgetExhausted is returned by decoders when the resource budget they
// were given (a guard against maliciously long or deeply nested messages)
// runs out before the message has been fully processed.
type ErrBudgetExhausted struct{}

func (ErrBudgetExhausted) Error() string {
	return "decoder resource budget exhausted (message too long or too complex)"
}

codec/codectools/token.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
package codectools
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/ipld/go-ipld-prime"
7+
)
8+
9+
// Token is a unit of abstract syntax: a tagged union capable of describing
// any single element of IPLD Data Model data (a scalar value, or the
// open/close bracket of a map or list), plus position-tracking bookkeeping.
// A sequence of Tokens can describe any Data Model value.
type Token struct {
	Kind TokenKind

	Length int       // Present for MapOpen or ListOpen. May be -1 for "unknown" (e.g. a json tokenizer will yield this).
	Bool   bool      // Value. Union: only has meaning if Kind is TokenKind_Bool.
	Int    int64     // Value. Union: only has meaning if Kind is TokenKind_Int.
	Float  float64   // Value. Union: only has meaning if Kind is TokenKind_Float.
	Str    string    // Value. Union: only has meaning if Kind is TokenKind_String. ('Str' rather than 'String' to avoid collision with method.)
	Bytes  []byte    // Value. Union: only has meaning if Kind is TokenKind_Bytes.
	Link   ipld.Link // Value. Union: only has meaning if Kind is TokenKind_Link.

	Node ipld.Node // Direct pointer to the original data, if this token is used to communicate data during a walk of existing in-memory data. Absent when token is being used during deserialization.

	// The following fields all track position and progress:
	// (These may be useful to copy into any error messages if errors arise.)
	// (Implementations may assume token reuse and treat these as state keeping;
	// you may experience position accounting accuracy problems if *not* reusing tokens or if zeroing these fields.)

	pth          []ipld.PathSegment // Set by token producers (whether marshallers or deserializers) to track logical position.
	offset       int64              // Set by deserializers (for both textual or binary formats alike) to track progress.
	lineOffset   int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
	columnOffset int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
}
32+
33+
func (tk Token) String() string {
34+
switch tk.Kind {
35+
case TokenKind_MapOpen:
36+
return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
37+
case TokenKind_MapClose:
38+
return fmt.Sprintf("<%c>", tk.Kind)
39+
case TokenKind_ListOpen:
40+
return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
41+
case TokenKind_ListClose:
42+
return fmt.Sprintf("<%c>", tk.Kind)
43+
case TokenKind_Null:
44+
return fmt.Sprintf("<%c>", tk.Kind)
45+
case TokenKind_Bool:
46+
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Bool)
47+
case TokenKind_Int:
48+
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Int)
49+
case TokenKind_Float:
50+
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Float)
51+
case TokenKind_String:
52+
return fmt.Sprintf("<%c:%q>", tk.Kind, tk.Str)
53+
case TokenKind_Bytes:
54+
return fmt.Sprintf("<%c:%x>", tk.Kind, tk.Bytes)
55+
case TokenKind_Link:
56+
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Link)
57+
default:
58+
return "<INVALID>"
59+
}
60+
}
61+
62+
// TokenKind enumerates the varieties of Token.
// The values are mnemonic ASCII characters (mostly drawn from JSON syntax
// or format-string verbs), which makes a TokenKind printable with the %c verb
// and token dumps easy to eyeball.
type TokenKind uint8

const (
	TokenKind_MapOpen   TokenKind = '{' // Begins a map; Token.Length may carry the entry count.
	TokenKind_MapClose  TokenKind = '}' // Ends the most recently opened map.
	TokenKind_ListOpen  TokenKind = '[' // Begins a list; Token.Length may carry the entry count.
	TokenKind_ListClose TokenKind = ']' // Ends the most recently opened list.
	TokenKind_Null      TokenKind = '0' // A null value; no payload field is meaningful.
	TokenKind_Bool      TokenKind = 'b' // Payload is in Token.Bool.
	TokenKind_Int       TokenKind = 'i' // Payload is in Token.Int.
	TokenKind_Float     TokenKind = 'f' // Payload is in Token.Float.
	TokenKind_String    TokenKind = 's' // Payload is in Token.Str.
	TokenKind_Bytes     TokenKind = 'x' // Payload is in Token.Bytes.
	TokenKind_Link      TokenKind = '/' // Payload is in Token.Link.
)
77+
78+
// ErrMalformedTokenSequence is returned when a stream of tokens arrives in an
// order that could never describe a valid serial data stream
// (for example, a map-close token with no map currently open).
type ErrMalformedTokenSequence struct {
	Detail string // Human-readable description of which sequencing rule was broken.
}

func (e ErrMalformedTokenSequence) Error() string {
	const prefix = "malformed token sequence: "
	return prefix + e.Detail
}
Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
package codectools
2+
3+
import (
4+
"fmt"
5+
"io"
6+
7+
"github.com/ipld/go-ipld-prime"
8+
"github.com/ipld/go-ipld-prime/codec"
9+
)
10+
11+
// TokenAssemble takes an ipld.NodeAssembler and a TokenReader,
12+
// and repeatedly pumps the TokenReader for tokens and feeds their data into the ipld.NodeAssembler
13+
// until it finishes a complete value.
14+
//
15+
// To compare and contrast to other token oriented tools:
16+
// TokenAssemble does the same direction of information transfer as the TokenAssembler gadget does,
17+
// but TokenAssemble moves completely through a value in one step,
18+
// whereas the TokenAssembler accepts tokens pumped into it one step at a time.
19+
//
20+
// TokenAssemble does not enforce the "map keys must be strings" rule which is present in the Data Model;
21+
// it will also happily do even recursive structures in map keys,
22+
// meaning it can be used when handling schema values like maps with complex keys.
23+
func TokenAssemble(na ipld.NodeAssembler, tr TokenReader, budget int) error {
24+
tk, err := tr(&budget)
25+
if err != nil {
26+
return err
27+
}
28+
return tokenAssemble(na, tk, tr, &budget)
29+
}
30+
31+
// tokenAssemble is the recursive worker behind TokenAssemble: it applies the
// already-read token tk to na, pulling further tokens from tr as needed to
// complete one whole value, and debiting budget as it goes (one unit per
// scalar or map/list entry; strings and bytes cost their length).
// It returns codec.ErrBudgetExhausted if the budget goes negative.
func tokenAssemble(na ipld.NodeAssembler, tk *Token, tr TokenReader, budget *int) error {
	if *budget < 0 {
		return codec.ErrBudgetExhausted{}
	}
	switch tk.Kind {
	case TokenKind_MapOpen:
		if tk.Length > 0 && *budget < tk.Length*2 { // Pre-check budget: at least two decrements estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		ma, err := na.BeginMap(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the map is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the map has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_MapClose {
				return ma.Finish()
			}
			// Recurse to assemble the key.
			*budget-- // Decrement budget by at least one for each key. The key content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleKey(), tk, tr, budget); err != nil {
				return err
			}
			// Recurse to assemble the value.
			// (We don't really care to peek this token, but do so anyway to keep the calling convention regular.)
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next entry or the end of the map.
		}
	case TokenKind_MapClose:
		return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
	case TokenKind_ListOpen:
		if tk.Length > 0 && *budget < tk.Length { // Pre-check budget: at least one decrement estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		la, err := na.BeginList(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the list is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the list has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_ListClose {
				return la.Finish()
			}
			// Recurse to assemble the value.
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(la.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next value or the end of the list.
		}
	case TokenKind_ListClose:
		return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
	case TokenKind_Null:
		// NOTE(review): null does not debit the budget — presumably intentional
		// (it carries no content), but worth confirming against the other scalar cases.
		return na.AssignNull()
	case TokenKind_Bool:
		*budget--
		return na.AssignBool(tk.Bool)
	case TokenKind_Int:
		*budget--
		return na.AssignInt(int(tk.Int))
	case TokenKind_Float:
		*budget--
		return na.AssignFloat(tk.Float)
	case TokenKind_String:
		*budget -= len(tk.Str) // Variable-size values are charged by their length.
		return na.AssignString(tk.Str)
	case TokenKind_Bytes:
		*budget -= len(tk.Bytes) // Variable-size values are charged by their length.
		return na.AssignBytes(tk.Bytes)
	case TokenKind_Link:
		*budget--
		return na.AssignLink(tk.Link)
	default:
		// Reaching here means the token producer emitted a Kind outside the enum:
		// a programmer error, hence panic rather than an error return.
		panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
	}
}
124+
125+
// --- the stepwise assembler system (more complicated; has a userland stack) is below -->

// TokenAssembler is a resumable assembly state machine: tokens are pumped in
// one at a time via Process, and their content is fed into the
// ipld.NodeAssembler provided to Initialize.
// (Contrast with the TokenAssemble function, which drives the whole process
// itself in a single call.)
type TokenAssembler struct {
	// This structure is designed to be embeddable. Use Initialize when doing so.

	stk    assemblerStack // this is going to end up being a stack you know
	budget int64
}
133+
134+
type assemblerStackRow struct {
135+
state uint8 // 0: assign this node; 1: continue list; 2: continue map with key; 3: continue map with value.
136+
na ipld.NodeAssembler // Always present.
137+
la ipld.ListAssembler // At most one of these is present.
138+
ma ipld.MapAssembler // At most one of these is present.
139+
}
140+
type assemblerStack []assemblerStackRow
141+
142+
func (stk assemblerStack) Tip() *assemblerStackRow {
143+
return &stk[len(stk)-1]
144+
}
145+
func (stk *assemblerStack) Push(na ipld.NodeAssembler) {
146+
*stk = append(*stk, assemblerStackRow{na: na})
147+
}
148+
func (stk *assemblerStack) Pop() {
149+
if len(*stk) == 0 {
150+
return
151+
}
152+
*stk = (*stk)[0 : len(*stk)-1]
153+
}
154+
155+
func (ta *TokenAssembler) Initialize(na ipld.NodeAssembler, budget int64) {
156+
if ta.stk == nil {
157+
ta.stk = make(assemblerStack, 0, 10)
158+
} else {
159+
ta.stk = ta.stk[0:0]
160+
}
161+
ta.stk.Push(na)
162+
ta.budget = budget
163+
}
164+
165+
// Process takes a Token pointer as an argument.
// (Notice how this function happens to match the definition of the visitFn that's usable as an argument to TokenWalk.)
// The token argument can be understood to be "borrowed" for the duration of the Process call, but will not be mutated.
// The use of a pointer here is so that a single Token can be reused by multiple calls, avoiding unnecessary allocations.
//
// Note that Process does very little sanity checking of token sequences itself,
// mostly handing information to the NodeAssemblers directly,
// which presumably will reject the data if it is out of line.
// The NodeAssembler this TokenAssembler is wrapping should already be enforcing the relevant logical rules,
// so it is not useful for TokenAssembler.Process to attempt to duplicate those checks;
// TokenAssembler.Process will also return any errors from the NodeAssembler without attempting to enforce a pattern on those errors.
// In particular, TokenAssembler.Process does not check if every MapOpen is paired with a MapClose;
// it does not check if every ListOpen is paired with a ListClose;
// and it does not check if the token stream is continuing after all open recursives have been closed.
// TODO: review this documentation; more of these checks turn out necessary anyway than originally expected.
func (ta *TokenAssembler) Process(tk *Token) (err error) {
	// An empty stack means the root value is already complete; any further token is surplus.
	if len(ta.stk) == 0 {
		return io.EOF
	}
	tip := ta.stk.Tip()
	switch tip.state {
	case 0: // "assign this node": the token begins (or entirely is) the value for tip.na.
		switch tk.Kind {
		case TokenKind_MapOpen:
			// Begin a map and stay on this row, now expecting a key (or close) next.
			tip.ma, err = tip.na.BeginMap(tk.Length)
			tip.state = 2
			return err
		case TokenKind_MapClose:
			// Mostly we try to just forward things, but can't not check this one: tip.ma would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
		case TokenKind_ListOpen:
			// Begin a list and stay on this row, now expecting a value (or close) next.
			tip.la, err = tip.na.BeginList(tk.Length)
			tip.state = 1
			return err
		case TokenKind_ListClose:
			// Mostly we try to just forward things, but can't not check this one: tip.la would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
		case TokenKind_Null:
			// Scalars complete this row in one step: assign, then pop.
			err = tip.na.AssignNull()
			ta.stk.Pop()
			return err
		case TokenKind_Bool:
			err = tip.na.AssignBool(tk.Bool)
			ta.stk.Pop()
			return err
		case TokenKind_Int:
			err = tip.na.AssignInt(int(tk.Int)) // TODO: upgrade all of ipld to use high precision int consistently
			ta.stk.Pop()
			return err
		case TokenKind_Float:
			err = tip.na.AssignFloat(tk.Float)
			ta.stk.Pop()
			return err
		case TokenKind_String:
			err = tip.na.AssignString(tk.Str)
			ta.stk.Pop()
			return err
		case TokenKind_Bytes:
			err = tip.na.AssignBytes(tk.Bytes)
			ta.stk.Pop()
			return err
		case TokenKind_Link:
			err = tip.na.AssignLink(tk.Link)
			ta.stk.Pop()
			return err
		default:
			// A Kind outside the enum is a programmer error in the token producer.
			panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
		}
		return nil // (Unreachable: every case above returns or panics.)
	case 1: // "continue list": either close it, or push a row for the next value.
		if tk.Kind == TokenKind_ListClose {
			err = tip.la.Finish()
			ta.stk.Pop()
			return err
		}
		ta.stk.Push(tip.la.AssembleValue())
		return ta.Process(tk) // Re-dispatch the same token against the new row.
	case 2: // "continue map with key": either close it, or push a row for the next key.
		if tk.Kind == TokenKind_MapClose {
			err = tip.ma.Finish()
			ta.stk.Pop()
			return err
		}
		tip.state = 3 // After the key completes, this row will expect a value.
		ta.stk.Push(tip.ma.AssembleKey())
		return ta.Process(tk) // Re-dispatch the same token against the new row.
	case 3: // "continue map with value": push a row for the value.
		tip.state = 2 // After the value completes, this row will expect a key (or close) again.
		ta.stk.Push(tip.ma.AssembleValue())
		return ta.Process(tk) // Re-dispatch the same token against the new row.
	default:
		panic("unreachable")
	}
}

0 commit comments

Comments
 (0)