From 09ad1b722afb8d3b7029a458b6e289370e2d36db Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Thu, 12 Aug 2021 16:16:34 +0200 Subject: [PATCH] codecs: more docs, a terminology guide, consistency in options. No breaking changes. In fact I think this should be behaviorally identical to before, despite the size of the diff. There are some functions that are explicitly marked as deprecated now, but I am not removing them in this diff. One exception: some struct names did change. However, these have not been included in any tagged release so far, so I believe changing them now is still pretty fair game, and the number of affected people should be slim to none. The main source of changes is pushing towards nomenclature consistency, which is described in the new readme file also included in the diff. --- CHANGELOG.md | 2 +- codec/README.md | 77 +++++++++++++++++++++++++++++++++++++ codec/api.go | 8 ++++ codec/cbor/multicodec.go | 20 +++++++--- codec/dagcbor/marshal.go | 75 +++++++++++++++++++++++++----------- codec/dagcbor/multicodec.go | 46 +++++++++++----------- codec/dagcbor/unmarshal.go | 35 +++++++++++++++-- codec/dagjson/marshal.go | 48 +++++++++++++++++++---- codec/dagjson/multicodec.go | 63 ++++++++++++------------------ codec/dagjson/unmarshal.go | 51 ++++++++++++++++++++++-- codec/json/multicodec.go | 45 +++++++--------------- codec/marshal.go | 6 +++ codec/unmarshal.go | 6 +++ node/tests/testcase.go | 5 ++- 14 files changed, 348 insertions(+), 139 deletions(-) create mode 100644 codec/README.md diff --git a/CHANGELOG.md b/CHANGELOG.md index f57ad816..481579f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,7 +55,7 @@ When a release tag is made, this block of bullet points will just slide down to - The codecs do not reject other orderings when parsing serial data. The `ipld.Node` trees resulting from deserialization will still preserve the serialized order. However, it has now become impossible to re-encode data in that same preserved order. 
- - If doing your own encoding, there are customization options in `dagcbor.MarshalOptions.MapSortMode` and `dagjson.MarshalOptions.SortMapKeys`. + - If doing your own encoding, there are customization options in `dagcbor.EncodeOptions.MapSortMode` and `dagjson.EncodeOptions.MapSortMode`. (However, note that these options are not available to you while using any systems that only operate in terms of multicodec codes.) - _Be cautious of this change._ It is now extremely easy to write code which puts data into an `ipld.Node` in memory in one order, then save and load that data using these codecs, and end up with different data as a result because the sorting changes the order of data. diff --git a/codec/README.md b/codec/README.md new file mode 100644 index 00000000..8cc753ac --- /dev/null +++ b/codec/README.md @@ -0,0 +1,77 @@ +Codecs +====== + +The `go-ipld-prime/codec` package is a grouping package. +The subpackages contain some codecs which reside in this repo. + +The codecs included here are our "batteries included" codecs, +but they are not otherwise special. + +It is not necessary for a codec to be a subpackage here to be a valid codec to use with go-ipld-prime; +anything that implements the `ipld.Encoder` and `ipld.Decoder` interfaces is fine. + + +Terminology +----------- + +We generally refer to "codecs" as having an "encode" function and "decode" function. + +We consider "encoding" to be the process of going from {Data Model} to {serial data}, +and "decoding" to be the process of going from {serial data} to {Data Model}. + +### Codec vs Multicodec + +A "codec" is _any_ function that goes from {Data Model} to {serial data}, or vice versa. + +A "multicodec" is a function which does that and is _also_ specifically recognized and described in +the tables in https://github.com/multiformats/multicodec/ . 
+ +Multicodecs generally leave no further room for customization and configuration, +because their entire behavior is supposed to be specified by a multicodec indicator code number. + +Our codecs, in the child packages of this one, usually offer configuration options. +They also usually offer exactly one function, which does *not* allow configuration, +which is supplying a multicodec-compatible behavior. +You'll see this marked in the docs on those functions. + +### Marshal vs Encode + +It's common to see the terms "marshal" and "unmarshal" used in golang. + +Those terms are usually describing when structured data is transformed into linearized, tokenized data +(and then, perhaps, all the way to serially encoded data), or vice versa. + +We would use the words the same way... except we don't end up using them, +because that feature doesn't really come up in our codec layer. + +In IPLD, we would describe mapping some typed data into Data Model as "marshalling". +(It's one step shy of tokenizing, but barely: Data Model does already have defined ordering for every element of data.) +And we do have systems that do this: +`bindnode` and our codegen systems both do this, implicitly, when they give you an `ipld.Node` of the representation of some data. + +We just don't end up talking about it as "marshalling" because of how it's done implicitly by those systems. +As a result, all of our features relating to codecs only end up speaking about "encoding" and "decoding". + +### Legacy code + +There are some appearances of the words "marshal" and "unmarshal" in some of our subpackages here. + +That verbiage is generally on the way out. +For functions and structures with those names, you'll notice their docs marking them as deprecated. + + +Why have "batteries-included" codecs? 
+------------------------------------- + +These codecs live in this repo because they're commonly used, highly supported, +and general-purpose codecs that we recommend for widespread usage in new developments. + +Also, it's just plain nice to have something in-repo for development purposes. +It makes sure that if we try to make any API changes, we immediately see if they'd make codecs harder to implement. +We also use the batteries-included codecs for debugging, for test fixtures, and for benchmarking. + +Further yet, the batteries-included codecs let us offer getting-started APIs. +For example, we offer some helper APIs which use codecs like e.g. JSON to give consumers of the libraries +one-step helper methods that "do the right thing" with zero config... so long as they happen to use that codec. +Even for consumers who don't use those codecs, such functions then serve as natural documentation +and examples for what to do to put their codec of choice to work. diff --git a/codec/api.go b/codec/api.go index 3e41f651..b61c4e07 100644 --- a/codec/api.go +++ b/codec/api.go @@ -42,3 +42,11 @@ type ErrBudgetExhausted struct{} func (e ErrBudgetExhausted) Error() string { return "decoder resource budget exhausted (message too long or too complex)" } + +type MapSortMode uint8 + +const ( + MapSortMode_None MapSortMode = iota + MapSortMode_Lexical + MapSortMode_RFC7049 +) diff --git a/codec/cbor/multicodec.go b/codec/cbor/multicodec.go index da086434..71335de8 100644 --- a/codec/cbor/multicodec.go +++ b/codec/cbor/multicodec.go @@ -3,8 +3,6 @@ package cbor import ( "io" - "github.com/polydawn/refmt/cbor" - "github.com/ipld/go-ipld-prime" "github.com/ipld/go-ipld-prime/codec/dagcbor" "github.com/ipld/go-ipld-prime/multicodec" @@ -20,12 +18,22 @@ func init() { multicodec.RegisterDecoder(0x51, Decode) } +// Decode deserializes data from the given io.Reader and feeds it into the given ipld.NodeAssembler. +// Decode fits the ipld.Decoder function interface. 
+// +// This is the function that will be registered in the default multicodec registry during package init time. func Decode(na ipld.NodeAssembler, r io.Reader) error { - return dagcbor.Unmarshal(na, cbor.NewDecoder(cbor.DecodeOptions{}, r), - dagcbor.UnmarshalOptions{AllowLinks: false}) + return dagcbor.DecodeOptions{ + AllowLinks: false, + }.Decode(na, r) } +// Encode walks the given ipld.Node and serializes it to the given io.Writer. +// Encode fits the ipld.Encoder function interface. +// +// This is the function that will be registered in the default multicodec registry during package init time. func Encode(n ipld.Node, w io.Writer) error { - return dagcbor.Marshal(n, cbor.NewEncoder(w), - dagcbor.MarshalOptions{AllowLinks: false}) + return dagcbor.EncodeOptions{ + AllowLinks: false, + }.Encode(n, w) } diff --git a/codec/dagcbor/marshal.go b/codec/dagcbor/marshal.go index e008b683..e8ce035c 100644 --- a/codec/dagcbor/marshal.go +++ b/codec/dagcbor/marshal.go @@ -2,12 +2,15 @@ package dagcbor import ( "fmt" + "io" "sort" + "github.com/polydawn/refmt/cbor" "github.com/polydawn/refmt/shared" "github.com/polydawn/refmt/tok" ipld "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" cidlink "github.com/ipld/go-ipld-prime/linking/cid" ) @@ -15,28 +18,46 @@ import ( // except for the `case ipld.Kind_Link` block, // which is dag-cbor's special sauce for schemafree links. -const ( - MapSortMode_none = iota - MapSortMode_RFC7049 -) - -type MarshalOptions struct { - // If true, allow encoding of Link nodes as CBOR tag(42), otherwise reject - // them as unencodable +// EncodeOptions can be used to customize the behavior of an encoding function. +// The Encode method on this struct fits the ipld.Encoder function interface. +type EncodeOptions struct { + // If true, allow encoding of Link nodes as CBOR tag(42); + // otherwise, reject them as unencodable. 
AllowLinks bool - // Control the sorting of map keys, MapSortMode_none for no sorting or - // MapSortMode_RFC7049 for length-first bytewise sorting as per RFC7049 and - // DAG-CBOR - MapSortMode int + // Control the sorting of map keys, using one of the `codec.MapSortMode_*` constants. + MapSortMode codec.MapSortMode } -func Marshal(n ipld.Node, sink shared.TokenSink, options MarshalOptions) error { +// Encode walks the given ipld.Node and serializes it to the given io.Writer. +// Encode fits the ipld.Encoder function interface. +// +// The behavior of the encoder can be customized by setting fields in the EncodeOptions struct before calling this method. +func (cfg EncodeOptions) Encode(n ipld.Node, w io.Writer) error { + // Probe for a builtin fast path. Shortcut to that if possible. + type detectFastPath interface { + EncodeDagCbor(io.Writer) error + } + if n2, ok := n.(detectFastPath); ok { + return n2.EncodeDagCbor(w) + } + // Okay, generic inspection path. + return Marshal(n, cbor.NewEncoder(w), cfg) +} + +// Future work: we would like to remove the Marshal function, +// and in particular, stop seeing types from refmt (like shared.TokenSink) be visible. +// Right now, some kinds of configuration (e.g. for whitespace and prettyprint) are only available through interacting with the refmt types; +// we should improve our API so that this can be done with only our own types in this package. + +// Marshal is a deprecated function. +// Please consider switching to EncodeOptions.Encode instead. 
+func Marshal(n ipld.Node, sink shared.TokenSink, options EncodeOptions) error { var tk tok.Token return marshal(n, &tk, sink, options) } -func marshal(n ipld.Node, tk *tok.Token, sink shared.TokenSink, options MarshalOptions) error { +func marshal(n ipld.Node, tk *tok.Token, sink shared.TokenSink, options EncodeOptions) error { switch n.Kind() { case ipld.Kind_Invalid: return fmt.Errorf("cannot traverse a node that is absent") @@ -138,14 +159,14 @@ func marshal(n ipld.Node, tk *tok.Token, sink shared.TokenSink, options MarshalO } } -func marshalMap(n ipld.Node, tk *tok.Token, sink shared.TokenSink, options MarshalOptions) error { +func marshalMap(n ipld.Node, tk *tok.Token, sink shared.TokenSink, options EncodeOptions) error { // Emit start of map. tk.Type = tok.TMapOpen tk.Length = int(n.Length()) // TODO: overflow check if _, err := sink.Step(tk); err != nil { return err } - if options.MapSortMode == MapSortMode_RFC7049 { + if options.MapSortMode != codec.MapSortMode_None { // Collect map entries, then sort by key type entry struct { key string @@ -163,14 +184,22 @@ func marshalMap(n ipld.Node, tk *tok.Token, sink shared.TokenSink, options Marsh } entries = append(entries, entry{keyStr, v}) } - // RFC7049 style sort as per DAG-CBOR spec - sort.Slice(entries, func(i, j int) bool { - li, lj := len(entries[i].key), len(entries[j].key) - if li == lj { + // Apply the desired sort function. + switch options.MapSortMode { + case codec.MapSortMode_Lexical: + sort.Slice(entries, func(i, j int) bool { return entries[i].key < entries[j].key - } - return li < lj - }) + }) + case codec.MapSortMode_RFC7049: + sort.Slice(entries, func(i, j int) bool { + // RFC7049 style sort as per DAG-CBOR spec + li, lj := len(entries[i].key), len(entries[j].key) + if li == lj { + return entries[i].key < entries[j].key + } + return li < lj + }) + } // Emit map contents (and recurse). 
for _, e := range entries { tk.Type = tok.TString diff --git a/codec/dagcbor/multicodec.go b/codec/dagcbor/multicodec.go index 05b7c982..c09daf66 100644 --- a/codec/dagcbor/multicodec.go +++ b/codec/dagcbor/multicodec.go @@ -3,9 +3,8 @@ package dagcbor import ( "io" - "github.com/polydawn/refmt/cbor" - "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" "github.com/ipld/go-ipld-prime/multicodec" ) @@ -19,28 +18,31 @@ func init() { multicodec.RegisterDecoder(0x71, Decode) } +// Decode deserializes data from the given io.Reader and feeds it into the given ipld.NodeAssembler. +// Decode fits the ipld.Decoder function interface. +// +// A similar function is available on DecodeOptions type if you would like to customize any of the decoding details. +// This function uses the defaults for the dag-cbor codec +// (meaning: links (indicated by tag 42) are decoded). +// +// This is the function that will be registered in the default multicodec registry during package init time. func Decode(na ipld.NodeAssembler, r io.Reader) error { - // Probe for a builtin fast path. Shortcut to that if possible. - type detectFastPath interface { - DecodeDagCbor(io.Reader) error - } - if na2, ok := na.(detectFastPath); ok { - return na2.DecodeDagCbor(r) - } - // Okay, generic builder path. - return Unmarshal(na, cbor.NewDecoder(cbor.DecodeOptions{}, r), - UnmarshalOptions{AllowLinks: true}) + return DecodeOptions{ + AllowLinks: true, + }.Decode(na, r) } +// Encode walks the given ipld.Node and serializes it to the given io.Writer. +// Encode fits the ipld.Encoder function interface. +// +// A similar function is available on EncodeOptions type if you would like to customize any of the encoding details. +// This function uses the defaults for the dag-cbor codec +// (meaning: links are encoded, and map keys are sorted (with RFC7049 ordering!) during encode). +// +// This is the function that will be registered in the default multicodec registry during package init time. 
func Encode(n ipld.Node, w io.Writer) error { - // Probe for a builtin fast path. Shortcut to that if possible. - type detectFastPath interface { - EncodeDagCbor(io.Writer) error - } - if n2, ok := n.(detectFastPath); ok { - return n2.EncodeDagCbor(w) - } - // Okay, generic inspection path. - return Marshal(n, cbor.NewEncoder(w), - MarshalOptions{AllowLinks: true, MapSortMode: MapSortMode_RFC7049}) + return EncodeOptions{ + AllowLinks: true, + MapSortMode: codec.MapSortMode_RFC7049, + }.Encode(n, w) } diff --git a/codec/dagcbor/unmarshal.go b/codec/dagcbor/unmarshal.go index cf17cd7f..b4bef240 100644 --- a/codec/dagcbor/unmarshal.go +++ b/codec/dagcbor/unmarshal.go @@ -3,9 +3,11 @@ package dagcbor import ( "errors" "fmt" + "io" "math" cid "github.com/ipfs/go-cid" + "github.com/polydawn/refmt/cbor" "github.com/polydawn/refmt/shared" "github.com/polydawn/refmt/tok" @@ -27,12 +29,37 @@ const ( // except for the `case tok.TBytes` block, // which has dag-cbor's special sauce for detecting schemafree links. -type UnmarshalOptions struct { +// DecodeOptions can be used to customize the behavior of a decoding function. +// The Decode method on this struct fits the ipld.Decoder function interface. +type DecodeOptions struct { // If true, parse DAG-CBOR tag(42) as Link nodes, otherwise reject them AllowLinks bool } -func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, options UnmarshalOptions) error { +// Decode deserializes data from the given io.Reader and feeds it into the given ipld.NodeAssembler. +// Decode fits the ipld.Decoder function interface. +// +// The behavior of the decoder can be customized by setting fields in the DecodeOptions struct before calling this method. +func (cfg DecodeOptions) Decode(na ipld.NodeAssembler, r io.Reader) error { + // Probe for a builtin fast path. Shortcut to that if possible. 
+ type detectFastPath interface { + DecodeDagCbor(io.Reader) error + } + if na2, ok := na.(detectFastPath); ok { + return na2.DecodeDagCbor(r) + } + // Okay, generic builder path. + return Unmarshal(na, cbor.NewDecoder(cbor.DecodeOptions{}, r), cfg) +} + +// Future work: we would like to remove the Unmarshal function, +// and in particular, stop seeing types from refmt (like shared.TokenSource) be visible. +// Right now, some kinds of configuration (e.g. for whitespace and prettyprint) are only available through interacting with the refmt types; +// we should improve our API so that this can be done with only our own types in this package. + +// Unmarshal is a deprecated function. +// Please consider switching to DecodeOptions.Decode instead. +func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, options DecodeOptions) error { // Have a gas budget, which will be decremented as we allocate memory, and an error returned when execeeded (or about to be exceeded). // This is a DoS defense mechanism. // It's *roughly* in units of bytes (but only very, VERY roughly) -- it also treats words as 1 in many cases. @@ -41,7 +68,7 @@ func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, options Unmarsh return unmarshal1(na, tokSrc, &gas, options) } -func unmarshal1(na ipld.NodeAssembler, tokSrc shared.TokenSource, gas *int, options UnmarshalOptions) error { +func unmarshal1(na ipld.NodeAssembler, tokSrc shared.TokenSource, gas *int, options DecodeOptions) error { var tk tok.Token done, err := tokSrc.Step(&tk) if err != nil { @@ -55,7 +82,7 @@ func unmarshal1(na ipld.NodeAssembler, tokSrc shared.TokenSource, gas *int, opti // starts with the first token already primed. Necessary to get recursion // to flow right without a peek+unpeek system. 
-func unmarshal2(na ipld.NodeAssembler, tokSrc shared.TokenSource, tk *tok.Token, gas *int, options UnmarshalOptions) error { +func unmarshal2(na ipld.NodeAssembler, tokSrc shared.TokenSource, tk *tok.Token, gas *int, options DecodeOptions) error { // FUTURE: check for schema.TypedNodeBuilder that's going to parse a Link (they can slurp any token kind they want). switch tk.Type { case tok.TMapOpen: diff --git a/codec/dagjson/marshal.go b/codec/dagjson/marshal.go index 644d38cb..371e8e1f 100644 --- a/codec/dagjson/marshal.go +++ b/codec/dagjson/marshal.go @@ -3,12 +3,15 @@ package dagjson import ( "encoding/base64" "fmt" + "io" "sort" + "github.com/polydawn/refmt/json" "github.com/polydawn/refmt/shared" "github.com/polydawn/refmt/tok" ipld "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" cidlink "github.com/ipld/go-ipld-prime/linking/cid" ) @@ -16,7 +19,9 @@ import ( // except for the `case ipld.Kind_Link` block, // which is dag-json's special sauce for schemafree links. -type MarshalOptions struct { +// EncodeOptions can be used to customize the behavior of an encoding function. +// The Encode method on this struct fits the ipld.Encoder function interface. +type EncodeOptions struct { // If true, will encode nodes with a Link kind using the DAG-JSON // `{"/":"cid string"}` form. EncodeLinks bool @@ -25,12 +30,26 @@ type MarshalOptions struct { // `{"/":{"bytes":"base64 bytes..."}}` form. EncodeBytes bool - // If true, will sort map keys prior to encoding using plain bytewise - // comparison. - SortMapKeys bool + // Control the sorting of map keys, using one of the `codec.MapSortMode_*` constants. + MapSortMode codec.MapSortMode } -func Marshal(n ipld.Node, sink shared.TokenSink, options MarshalOptions) error { +// Encode walks the given ipld.Node and serializes it to the given io.Writer. +// Encode fits the ipld.Encoder function interface. 
+// +// The behavior of the encoder can be customized by setting fields in the EncodeOptions struct before calling this method. +func (cfg EncodeOptions) Encode(n ipld.Node, w io.Writer) error { + return Marshal(n, json.NewEncoder(w, json.EncodeOptions{}), cfg) +} + +// Future work: we would like to remove the Marshal function, +// and in particular, stop seeing types from refmt (like shared.TokenSink) be visible. +// Right now, some kinds of configuration (e.g. for whitespace and prettyprint) are only available through interacting with the refmt types; +// we should improve our API so that this can be done with only our own types in this package. + +// Marshal is a deprecated function. +// Please consider switching to EncodeOptions.Encode instead. +func Marshal(n ipld.Node, sink shared.TokenSink, options EncodeOptions) error { var tk tok.Token switch n.Kind() { case ipld.Kind_Invalid: @@ -46,7 +65,7 @@ func Marshal(n ipld.Node, sink shared.TokenSink, options MarshalOptions) error { if _, err := sink.Step(&tk); err != nil { return err } - if options.SortMapKeys { + if options.MapSortMode != codec.MapSortMode_None { // Collect map entries, then sort by key type entry struct { key string @@ -64,7 +83,22 @@ func Marshal(n ipld.Node, sink shared.TokenSink, options MarshalOptions) error { } entries = append(entries, entry{keyStr, v}) } - sort.Slice(entries, func(i, j int) bool { return entries[i].key < entries[j].key }) + // Apply the desired sort function. + switch options.MapSortMode { + case codec.MapSortMode_Lexical: + sort.Slice(entries, func(i, j int) bool { + return entries[i].key < entries[j].key + }) + case codec.MapSortMode_RFC7049: + sort.Slice(entries, func(i, j int) bool { + // RFC7049 style sort as per DAG-CBOR spec + li, lj := len(entries[i].key), len(entries[j].key) + if li == lj { + return entries[i].key < entries[j].key + } + return li < lj + }) + } // Emit map contents (and recurse). 
for _, e := range entries { tk.Type = tok.TString diff --git a/codec/dagjson/multicodec.go b/codec/dagjson/multicodec.go index 232d5fe9..d2931eb0 100644 --- a/codec/dagjson/multicodec.go +++ b/codec/dagjson/multicodec.go @@ -1,12 +1,10 @@ package dagjson import ( - "fmt" "io" - "github.com/polydawn/refmt/json" - "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" "github.com/ipld/go-ipld-prime/multicodec" ) @@ -20,46 +18,33 @@ func init() { multicodec.RegisterDecoder(0x0129, Decode) } +// Decode deserializes data from the given io.Reader and feeds it into the given ipld.NodeAssembler. +// Decode fits the ipld.Decoder function interface. +// +// A similar function is available on DecodeOptions type if you would like to customize any of the decoding details. +// This function uses the defaults for the dag-json codec +// (meaning: links are decoded, and bytes are decoded). +// +// This is the function that will be registered in the default multicodec registry during package init time. func Decode(na ipld.NodeAssembler, r io.Reader) error { - err := Unmarshal(na, json.NewDecoder(r), UnmarshalOptions{ + return DecodeOptions{ ParseLinks: true, ParseBytes: true, - }) - if err != nil { - return err - } - // Slurp any remaining whitespace. - // (This is relevant if our reader is tee'ing bytes to a hasher, and - // the json contained any trailing whitespace.) - // (We can't actually support multiple objects per reader from here; - // we can't unpeek if we find a non-whitespace token, so our only - // option is to error if this reader seems to contain more content.) 
- var buf [1]byte - for { - _, err := r.Read(buf[:]) - switch buf[0] { - case ' ', 0x0, '\t', '\r', '\n': // continue - default: - return fmt.Errorf("unexpected content after end of json object") - } - if err == nil { - continue - } else if err == io.EOF { - return nil - } else { - return err - } - } + }.Decode(na, r) } +// Encode walks the given ipld.Node and serializes it to the given io.Writer. +// Encode fits the ipld.Encoder function interface. +// +// A similar function is available on EncodeOptions type if you would like to customize any of the encoding details. +// This function uses the defaults for the dag-json codec +// (meaning: links are encoded, bytes are encoded, and map keys are sorted during encode). +// +// This is the function that will be registered in the default multicodec registry during package init time. func Encode(n ipld.Node, w io.Writer) error { - // Shell out directly to generic inspection path. - // (There's not really any fastpaths of note for json.) - // Write another function if you need to tune encoding options about whitespace. - return Marshal(n, json.NewEncoder(w, json.EncodeOptions{}), - MarshalOptions{ - EncodeLinks: true, - EncodeBytes: true, - SortMapKeys: true, - }) + return EncodeOptions{ + EncodeLinks: true, + EncodeBytes: true, + MapSortMode: codec.MapSortMode_Lexical, + }.Encode(n, w) } diff --git a/codec/dagjson/unmarshal.go b/codec/dagjson/unmarshal.go index 93cab58b..2dcdeff4 100644 --- a/codec/dagjson/unmarshal.go +++ b/codec/dagjson/unmarshal.go @@ -3,8 +3,10 @@ package dagjson import ( "encoding/base64" "fmt" + "io" cid "github.com/ipfs/go-cid" + "github.com/polydawn/refmt/json" "github.com/polydawn/refmt/shared" "github.com/polydawn/refmt/tok" @@ -20,7 +22,9 @@ import ( // several steps of handling maps, because it necessitates peeking several // tokens before deciding what kind of value to create). -type UnmarshalOptions struct { +// DecodeOptions can be used to customize the behavior of a decoding function. 
+// The Decode method on this struct fits the ipld.Decoder function interface. +type DecodeOptions struct { // If true, parse DAG-JSON `{"/":"cid string"}` as a Link kind node rather // than a plain map ParseLinks bool @@ -30,7 +34,48 @@ type UnmarshalOptions struct { ParseBytes bool } -func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, options UnmarshalOptions) error { +// Decode deserializes data from the given io.Reader and feeds it into the given ipld.NodeAssembler. +// Decode fits the ipld.Decoder function interface. +// +// The behavior of the decoder can be customized by setting fields in the DecodeOptions struct before calling this method. +func (cfg DecodeOptions) Decode(na ipld.NodeAssembler, r io.Reader) error { + err := Unmarshal(na, json.NewDecoder(r), cfg) + if err != nil { + return err + } + // Slurp any remaining whitespace. + // This behavior may be due for review. + // (This is relevant if our reader is tee'ing bytes to a hasher, and + // the json contained any trailing whitespace.) + // (We can't actually support multiple objects per reader from here; + // we can't unpeek if we find a non-whitespace token, so our only + // option is to error if this reader seems to contain more content.) + var buf [1]byte + for { + _, err := r.Read(buf[:]) + switch buf[0] { + case ' ', 0x0, '\t', '\r', '\n': // continue + default: + return fmt.Errorf("unexpected content after end of json object") + } + if err == nil { + continue + } else if err == io.EOF { + return nil + } else { + return err + } + } +} + +// Future work: we would like to remove the Unmarshal function, +// and in particular, stop seeing types from refmt (like shared.TokenSource) be visible. +// Right now, some kinds of configuration (e.g. for whitespace and prettyprint) are only available through interacting with the refmt types; +// we should improve our API so that this can be done with only our own types in this package. + +// Unmarshal is a deprecated function. 
+// Please consider switching to DecodeOptions.Decode instead. +func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, options DecodeOptions) error { var st unmarshalState st.options = options done, err := tokSrc.Step(&st.tk[0]) @@ -46,7 +91,7 @@ func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, options Unmarsh type unmarshalState struct { tk [7]tok.Token // mostly, only 0'th is used... but [1:7] are used during lookahead for links. shift int // how many times to slide something out of tk[1:7] instead of getting a new token. - options UnmarshalOptions + options DecodeOptions } // step leaves a "new" token in tk[0], diff --git a/codec/json/multicodec.go b/codec/json/multicodec.go index 9f2f8c7c..fb58578f 100644 --- a/codec/json/multicodec.go +++ b/codec/json/multicodec.go @@ -1,12 +1,12 @@ package json import ( - "fmt" "io" rfmtjson "github.com/polydawn/refmt/json" "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" "github.com/ipld/go-ipld-prime/codec/dagjson" "github.com/ipld/go-ipld-prime/multicodec" ) @@ -21,40 +21,21 @@ func init() { multicodec.RegisterDecoder(0x0200, Decode) } +// Decode deserializes data from the given io.Reader and feeds it into the given ipld.NodeAssembler. +// Decode fits the ipld.Decoder function interface. +// +// This is the function that will be registered in the default multicodec registry during package init time. func Decode(na ipld.NodeAssembler, r io.Reader) error { - // Shell out directly to generic builder path. - // (There's not really any fastpaths of note for json.) - err := dagjson.Unmarshal(na, rfmtjson.NewDecoder(r), dagjson.UnmarshalOptions{ + return dagjson.DecodeOptions{ ParseLinks: false, ParseBytes: false, - }) - if err != nil { - return err - } - // Slurp any remaining whitespace. - // (This is relevant if our reader is tee'ing bytes to a hasher, and - // the json contained any trailing whitespace.) 
- // (We can't actually support multiple objects per reader from here; - // we can't unpeek if we find a non-whitespace token, so our only - // option is to error if this reader seems to contain more content.) - var buf [1]byte - for { - _, err := r.Read(buf[:]) - switch buf[0] { - case ' ', 0x0, '\t', '\r', '\n': // continue - default: - return fmt.Errorf("unexpected content after end of json object") - } - if err == nil { - continue - } else if err == io.EOF { - return nil - } else { - return err - } - } + }.Decode(na, r) } +// Encode walks the given ipld.Node and serializes it to the given io.Writer. +// Encode fits the ipld.Encoder function interface. +// +// This is the function that will be registered in the default multicodec registry during package init time. func Encode(n ipld.Node, w io.Writer) error { // Shell out directly to generic inspection path. // (There's not really any fastpaths of note for json.) @@ -62,9 +43,9 @@ func Encode(n ipld.Node, w io.Writer) error { return dagjson.Marshal(n, rfmtjson.NewEncoder(w, rfmtjson.EncodeOptions{ Line: []byte{'\n'}, Indent: []byte{'\t'}, - }), dagjson.MarshalOptions{ + }), dagjson.EncodeOptions{ EncodeLinks: false, EncodeBytes: false, - SortMapKeys: false, + MapSortMode: codec.MapSortMode_None, }) } diff --git a/codec/marshal.go b/codec/marshal.go index 2b2eb5c6..4dc9f846 100644 --- a/codec/marshal.go +++ b/codec/marshal.go @@ -9,6 +9,12 @@ import ( ipld "github.com/ipld/go-ipld-prime" ) +// Future work: we would like to remove the Marshal function, +// and in particular, stop seeing types from refmt (like shared.TokenSink) be visible. + +// Marshal is a deprecated function. +// Please consider switching to one of the Encode functions of one of the subpackages instead. +// // Marshal provides a very general node-to-tokens marshalling feature. // It can handle either cbor or json by being combined with a refmt TokenSink. 
// diff --git a/codec/unmarshal.go b/codec/unmarshal.go index 51f7ac80..99d4cc3f 100644 --- a/codec/unmarshal.go +++ b/codec/unmarshal.go @@ -10,6 +10,9 @@ import ( ipld "github.com/ipld/go-ipld-prime" ) +// Future work: we would like to remove the Unmarshal function, +// and in particular, stop seeing types from refmt (like shared.TokenSource) be visible. + // wishlist: if we could reconstruct the ipld.Path of an error while // *unwinding* from that error... that'd be nice. // (trying to build it proactively would waste tons of allocs on the happy path.) @@ -19,6 +22,9 @@ import ( // They're effectively doing double duty: testing the builders, too. // (Is that sensible? Should it be refactored? Not sure; maybe!) +// Unmarshal is a deprecated function. +// Please consider switching to one of the Decode functions of one of the subpackages instead. +// // Unmarshal provides a very general tokens-to-node unmarshalling feature. // It can handle either cbor or json by being combined with a refmt TokenSink. // diff --git a/node/tests/testcase.go b/node/tests/testcase.go index 63648f87..5ccf1cba 100644 --- a/node/tests/testcase.go +++ b/node/tests/testcase.go @@ -11,6 +11,7 @@ import ( . "github.com/warpfork/go-wish" "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" "github.com/ipld/go-ipld-prime/codec/dagjson" "github.com/ipld/go-ipld-prime/schema" "github.com/ipld/go-ipld-prime/traversal" @@ -210,10 +211,10 @@ func testMarshal(t *testing.T, n ipld.Node, data string) { // We'll marshal with "pretty" linebreaks and indents (and re-format the fixture to the same) for better diffing. 
prettyprint := json.EncodeOptions{Line: []byte{'\n'}, Indent: []byte{'\t'}} var buf bytes.Buffer - err := dagjson.Marshal(n, json.NewEncoder(&buf, prettyprint), dagjson.MarshalOptions{ + err := dagjson.Marshal(n, json.NewEncoder(&buf, prettyprint), dagjson.EncodeOptions{ EncodeLinks: true, EncodeBytes: true, - SortMapKeys: true, + MapSortMode: codec.MapSortMode_Lexical, }) if err != nil { t.Errorf("marshal failed: %s", err)