Skip to content

Commit a9c260b

Browse files
dboehm-avalabs and Darioush Jalali authored
Merkle db Make Paths only refer to lists of nodes (#2143)
Signed-off-by: David Boehm <91908103+dboehm-avalabs@users.noreply.github.com> Co-authored-by: Darioush Jalali <darioush.jalali@avalabs.org>
1 parent 0faab95 commit a9c260b

27 files changed

+863
-859
lines changed

proto/pb/sync/sync.pb.go

Lines changed: 81 additions & 81 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

proto/sync/sync.proto

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ message RangeProof {
139139
}
140140

141141
message ProofNode {
142-
Path key = 1;
142+
Key key = 1;
143143
MaybeBytes value_or_hash = 2;
144144
map<uint32, bytes> children = 3;
145145
}
@@ -149,7 +149,7 @@ message KeyChange {
149149
MaybeBytes value = 2;
150150
}
151151

152-
message Path {
152+
message Key {
153153
uint64 length = 1;
154154
bytes value = 2;
155155
}

x/merkledb/README.md

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ To reduce the depth of nodes in the trie, a `Merkle Node` utilizes path compress
2121
| Merkle Node |
2222
| |
2323
| ID: 0x0131 | an id representing the current node, derived from the node's value and all children ids
24-
| Key: 0x91 | prefix of the key path, representing the location of the node in the trie
25-
| Value: 0x00 | the value, if one exists, that is stored at the key path (pathPrefix + compressedPath)
26-
| Children: | a map of children node ids for any nodes in the trie that have this node's key path as a prefix
24+
| Key: 0x91 | prefix of the key, representing the location of the node in the trie
25+
| Value: 0x00 | the value, if one exists, that is stored at the key (keyPrefix + compressedKey)
26+
| Children: | a map of children node ids for any nodes in the trie that have this node's key as a prefix
2727
| 0: [:0x00542F] | child 0 represents a node with key 0x910 with ID 0x00542F
2828
| 1: [0x432:0xA0561C] | child 1 represents a node with key 0x911432 with ID 0xA0561C
2929
| ... |
@@ -52,19 +52,19 @@ The node serialization format is as follows:
5252
+----------------------------------------------------+
5353
| Child index (varint) |
5454
+----------------------------------------------------+
55-
| Child compressed path length (varint) |
55+
| Child compressed key length (varint) |
5656
+----------------------------------------------------+
57-
| Child compressed path (variable length bytes) |
57+
| Child compressed key (variable length bytes) |
5858
+----------------------------------------------------+
5959
| Child ID (32 bytes) |
6060
+----------------------------------------------------+
6161
| Child has value (1 bytes) |
6262
+----------------------------------------------------+
6363
| Child index (varint) |
6464
+----------------------------------------------------+
65-
| Child compressed path length (varint) |
65+
| Child compressed key length (varint) |
6666
+----------------------------------------------------+
67-
| Child compressed path (variable length bytes) |
67+
| Child compressed key (variable length bytes) |
6868
+----------------------------------------------------+
6969
| Child ID (32 bytes) |
7070
+----------------------------------------------------+
@@ -80,8 +80,8 @@ Where:
8080
* `Value` is the value, if it exists (i.e. if `Value existence flag` is `1`). Otherwise not serialized.
8181
* `Number of children` is the number of children this node has.
8282
* `Child index` is the index of a child node within the list of the node's children.
83-
* `Child compressed path length` is the length of the child node's compressed path.
84-
* `Child compressed path` is the child node's compressed path.
83+
* `Child compressed key length` is the length of the child node's compressed key.
84+
* `Child compressed key` is the child node's compressed key.
8585
* `Child ID` is the child node's ID.
8686
* `Child has value` indicates if that child has a value.
8787

@@ -91,9 +91,9 @@ For each child of the node, we have an additional:
9191
+----------------------------------------------------+
9292
| Child index (varint) |
9393
+----------------------------------------------------+
94-
| Child compressed path length (varint) |
94+
| Child compressed key length (varint) |
9595
+----------------------------------------------------+
96-
| Child compressed path (variable length bytes) |
96+
| Child compressed key (variable length bytes) |
9797
+----------------------------------------------------+
9898
| Child ID (32 bytes) |
9999
+----------------------------------------------------+
@@ -114,8 +114,8 @@ Its byte representation (in hex) is: `0x01020204000210579EB3718A7E437D2DDCE931AC
114114

115115
The node's key is empty (it's the root) and has value `0x02`.
116116
It has two children.
117-
The first is at child index `0`, has compressed path `0x01` and ID (in hex) `0x579eb3718a7e437d2ddce931ac7cc05a0bc695a9c2084f5df12fb96ad0fa3266`.
118-
The second is at child index `14`, has compressed path `0x0F0F0F` and ID (in hex) `0x9845893c4f9d92c4e097fcf2589bc9d6882b1f18d1c2fc91d7df1d3fcbdb4238`.
117+
The first is at child index `0`, has compressed key `0x01` and ID (in hex) `0x579eb3718a7e437d2ddce931ac7cc05a0bc695a9c2084f5df12fb96ad0fa3266`.
118+
The second is at child index `14`, has compressed key `0x0F0F0F` and ID (in hex) `0x9845893c4f9d92c4e097fcf2589bc9d6882b1f18d1c2fc91d7df1d3fcbdb4238`.
119119

120120
```
121121
+--------------------------------------------------------------------+
@@ -134,10 +134,10 @@ The second is at child index `14`, has compressed path `0x0F0F0F` and ID (in hex
134134
| Child index (varint) |
135135
| 0x00 |
136136
+--------------------------------------------------------------------+
137-
| Child compressed path length (varint) |
137+
| Child compressed key length (varint) |
138138
| 0x02 |
139139
+--------------------------------------------------------------------+
140-
| Child compressed path (variable length bytes) |
140+
| Child compressed key (variable length bytes) |
141141
| 0x10 |
142142
+--------------------------------------------------------------------+
143143
| Child ID (32 bytes) |
@@ -146,10 +146,10 @@ The second is at child index `14`, has compressed path `0x0F0F0F` and ID (in hex
146146
| Child index (varint) |
147147
| 0x0E |
148148
+--------------------------------------------------------------------+
149-
| Child compressed path length (varint) |
149+
| Child compressed key length (varint) |
150150
| 0x06 |
151151
+--------------------------------------------------------------------+
152-
| Child compressed path (variable length bytes) |
152+
| Child compressed key (variable length bytes) |
153153
| 0xFFF0 |
154154
+--------------------------------------------------------------------+
155155
| Child ID (32 bytes) |
@@ -204,7 +204,7 @@ Where:
204204

205205
Note that, as with the node serialization format, the `Child index` values aren't necessarily sequential, but they are unique and strictly increasing.
206206
Also like the node serialization format, there can be up to 16 blocks of children data.
207-
However, note that child compressed paths are not included in the node ID calculation.
207+
However, note that child compressed keys are not included in the node ID calculation.
208208

209209
Once this is encoded, we `sha256` hash the resulting bytes to get the node's ID.
210210

@@ -227,7 +227,7 @@ By splitting the nodes up by value, it allows better key/value iteration and a m
227227

228228
### Single node type
229229

230-
A `Merkle Node` holds the IDs of its children, its value, as well as any path extension. This simplifies some logic and allows all of the data about a node to be loaded in a single database read. This trades off a small amount of storage efficiency (some fields may be `nil` but are still stored for every node).
230+
A `Merkle Node` holds the IDs of its children, its value, as well as any key extension. This simplifies some logic and allows all of the data about a node to be loaded in a single database read. This trades off a small amount of storage efficiency (some fields may be `nil` but are still stored for every node).
231231

232232
### Validity
233233

x/merkledb/codec.go

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,16 @@ const (
2121
falseByte = 0
2222
minVarIntLen = 1
2323
minMaybeByteSliceLen = boolLen
24-
minPathLen = minVarIntLen
24+
minKeyLen = minVarIntLen
2525
minByteSliceLen = minVarIntLen
2626
minDBNodeLen = minMaybeByteSliceLen + minVarIntLen
27-
minChildLen = minVarIntLen + minPathLen + ids.IDLen + boolLen
27+
minChildLen = minVarIntLen + minKeyLen + ids.IDLen + boolLen
2828

29-
estimatedKeyLen = 64
30-
estimatedValueLen = 64
31-
estimatedCompressedPathLen = 8
32-
// Child index, child compressed path, child ID, child has value
33-
estimatedNodeChildLen = minVarIntLen + estimatedCompressedPathLen + ids.IDLen + boolLen
29+
estimatedKeyLen = 64
30+
estimatedValueLen = 64
31+
estimatedCompressedKeyLen = 8
32+
// Child index, child compressed key, child ID, child has value
33+
estimatedNodeChildLen = minVarIntLen + estimatedCompressedKeyLen + ids.IDLen + boolLen
3434
// Child index, child ID
3535
hashValuesChildLen = minVarIntLen + ids.IDLen
3636
)
@@ -45,7 +45,7 @@ var (
4545
errChildIndexTooLarge = errors.New("invalid child index. Must be less than branching factor")
4646
errLeadingZeroes = errors.New("varint has leading zeroes")
4747
errInvalidBool = errors.New("decoded bool is neither true nor false")
48-
errNonZeroPathPadding = errors.New("path partial byte should be padded with 0s")
48+
errNonZeroKeyPadding = errors.New("key partial byte should be padded with 0s")
4949
errExtraSpace = errors.New("trailing buffer space")
5050
errIntOverflow = errors.New("value overflows int")
5151
)
@@ -102,7 +102,7 @@ func (c *codecImpl) encodeDBNode(n *dbNode, branchFactor BranchFactor) []byte {
102102
for index := 0; BranchFactor(index) < branchFactor; index++ {
103103
if entry, ok := n.children[byte(index)]; ok {
104104
c.encodeUint(buf, uint64(index))
105-
c.encodePath(buf, entry.compressedPath)
105+
c.encodeKey(buf, entry.compressedKey)
106106
_, _ = buf.Write(entry.id[:])
107107
c.encodeBool(buf, entry.hasValue)
108108
}
@@ -128,7 +128,7 @@ func (c *codecImpl) encodeHashValues(hv *hashValues) []byte {
128128
}
129129
}
130130
c.encodeMaybeByteSlice(buf, hv.Value)
131-
c.encodePath(buf, hv.Key)
131+
c.encodeKey(buf, hv.Key)
132132

133133
return buf.Bytes()
134134
}
@@ -168,7 +168,7 @@ func (c *codecImpl) decodeDBNode(b []byte, n *dbNode, branchFactor BranchFactor)
168168
}
169169
previousChild = index
170170

171-
compressedPath, err := c.decodePath(src, branchFactor)
171+
compressedKey, err := c.decodeKey(src, branchFactor)
172172
if err != nil {
173173
return err
174174
}
@@ -181,9 +181,9 @@ func (c *codecImpl) decodeDBNode(b []byte, n *dbNode, branchFactor BranchFactor)
181181
return err
182182
}
183183
n.children[byte(index)] = child{
184-
compressedPath: compressedPath,
185-
id: childID,
186-
hasValue: hasValue,
184+
compressedKey: compressedKey,
185+
id: childID,
186+
hasValue: hasValue,
187187
}
188188
}
189189
if src.Len() != 0 {
@@ -326,43 +326,43 @@ func (*codecImpl) decodeID(src *bytes.Reader) (ids.ID, error) {
326326
return id, err
327327
}
328328

329-
func (c *codecImpl) encodePath(dst *bytes.Buffer, p Path) {
330-
c.encodeUint(dst, uint64(p.tokensLength))
331-
_, _ = dst.Write(p.Bytes())
329+
func (c *codecImpl) encodeKey(dst *bytes.Buffer, key Key) {
330+
c.encodeUint(dst, uint64(key.tokenLength))
331+
_, _ = dst.Write(key.Bytes())
332332
}
333333

334-
func (c *codecImpl) decodePath(src *bytes.Reader, branchFactor BranchFactor) (Path, error) {
335-
if minPathLen > src.Len() {
336-
return Path{}, io.ErrUnexpectedEOF
334+
func (c *codecImpl) decodeKey(src *bytes.Reader, branchFactor BranchFactor) (Key, error) {
335+
if minKeyLen > src.Len() {
336+
return Key{}, io.ErrUnexpectedEOF
337337
}
338338

339339
length, err := c.decodeUint(src)
340340
if err != nil {
341-
return Path{}, err
341+
return Key{}, err
342342
}
343343
if length > math.MaxInt {
344-
return Path{}, errIntOverflow
344+
return Key{}, errIntOverflow
345345
}
346-
result := emptyPath(branchFactor)
347-
result.tokensLength = int(length)
348-
pathBytesLen := result.bytesNeeded(result.tokensLength)
349-
if pathBytesLen > src.Len() {
350-
return Path{}, io.ErrUnexpectedEOF
346+
result := emptyKey(branchFactor)
347+
result.tokenLength = int(length)
348+
keyBytesLen := result.bytesNeeded(result.tokenLength)
349+
if keyBytesLen > src.Len() {
350+
return Key{}, io.ErrUnexpectedEOF
351351
}
352-
buffer := make([]byte, pathBytesLen)
352+
buffer := make([]byte, keyBytesLen)
353353
if _, err := io.ReadFull(src, buffer); err != nil {
354354
if err == io.EOF {
355355
err = io.ErrUnexpectedEOF
356356
}
357-
return Path{}, err
357+
return Key{}, err
358358
}
359359
if result.hasPartialByte() {
360360
// Confirm that the padding bits in the partial byte are 0.
361361
// We want to only look at the bits to the right of the last token, which is at index length-1.
362362
// Generate a mask with (8-bitsToShift) 0s followed by bitsToShift 1s.
363-
paddingMask := byte(0xFF >> (8 - result.bitsToShift(result.tokensLength-1)))
364-
if buffer[pathBytesLen-1]&paddingMask != 0 {
365-
return Path{}, errNonZeroPathPadding
363+
paddingMask := byte(0xFF >> (8 - result.bitsToShift(result.tokenLength-1)))
364+
if buffer[keyBytesLen-1]&paddingMask != 0 {
365+
return Key{}, errNonZeroKeyPadding
366366
}
367367
}
368368
result.value = string(buffer)

x/merkledb/codec_test.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func FuzzCodecInt(f *testing.F) {
7373
)
7474
}
7575

76-
func FuzzCodecPath(f *testing.F) {
76+
func FuzzCodecKey(f *testing.F) {
7777
f.Fuzz(
7878
func(
7979
t *testing.T,
@@ -84,7 +84,7 @@ func FuzzCodecPath(f *testing.F) {
8484
codec := codec.(*codecImpl)
8585
reader := bytes.NewReader(b)
8686
startLen := reader.Len()
87-
got, err := codec.decodePath(reader, branchFactor)
87+
got, err := codec.decodeKey(reader, branchFactor)
8888
if err != nil {
8989
t.SkipNow()
9090
}
@@ -93,7 +93,7 @@ func FuzzCodecPath(f *testing.F) {
9393

9494
// Encoding [got] should be the same as [b].
9595
var buf bytes.Buffer
96-
codec.encodePath(&buf, got)
96+
codec.encodeKey(&buf, got)
9797
bufBytes := buf.Bytes()
9898
require.Len(bufBytes, numRead)
9999
require.Equal(b[:numRead], bufBytes)
@@ -155,12 +155,12 @@ func FuzzCodecDBNodeDeterministic(f *testing.F) {
155155
var childID ids.ID
156156
_, _ = r.Read(childID[:]) // #nosec G404
157157

158-
childPathBytes := make([]byte, r.Intn(32)) // #nosec G404
159-
_, _ = r.Read(childPathBytes) // #nosec G404
158+
childKeyBytes := make([]byte, r.Intn(32)) // #nosec G404
159+
_, _ = r.Read(childKeyBytes) // #nosec G404
160160

161161
children[byte(i)] = child{
162-
compressedPath: NewPath(childPathBytes, branchFactor),
163-
id: childID,
162+
compressedKey: ToKey(childKeyBytes, branchFactor),
163+
id: childID,
164164
}
165165
}
166166
node := dbNode{
@@ -225,14 +225,14 @@ func FuzzEncodeHashValues(f *testing.F) {
225225
children := map[byte]child{}
226226
numChildren := r.Intn(int(branchFactor)) // #nosec G404
227227
for i := 0; i < numChildren; i++ {
228-
compressedPathLen := r.Intn(32) // #nosec G404
229-
compressedPathBytes := make([]byte, compressedPathLen)
230-
_, _ = r.Read(compressedPathBytes) // #nosec G404
228+
compressedKeyLen := r.Intn(32) // #nosec G404
229+
compressedKeyBytes := make([]byte, compressedKeyLen)
230+
_, _ = r.Read(compressedKeyBytes) // #nosec G404
231231

232232
children[byte(i)] = child{
233-
compressedPath: NewPath(compressedPathBytes, branchFactor),
234-
id: ids.GenerateTestID(),
235-
hasValue: r.Intn(2) == 1, // #nosec G404
233+
compressedKey: ToKey(compressedKeyBytes, branchFactor),
234+
id: ids.GenerateTestID(),
235+
hasValue: r.Intn(2) == 1, // #nosec G404
236236
}
237237
}
238238

@@ -250,7 +250,7 @@ func FuzzEncodeHashValues(f *testing.F) {
250250
hv := &hashValues{
251251
Children: children,
252252
Value: value,
253-
Key: NewPath(key, branchFactor),
253+
Key: ToKey(key, branchFactor),
254254
}
255255

256256
// Serialize the *hashValues with both codecs
@@ -264,9 +264,9 @@ func FuzzEncodeHashValues(f *testing.F) {
264264
)
265265
}
266266

267-
func TestCodecDecodePathLengthOverflowRegression(t *testing.T) {
267+
func TestCodecDecodeKeyLengthOverflowRegression(t *testing.T) {
268268
codec := codec.(*codecImpl)
269269
bytes := bytes.NewReader(binary.AppendUvarint(nil, math.MaxInt))
270-
_, err := codec.decodePath(bytes, BranchFactor16)
270+
_, err := codec.decodeKey(bytes, BranchFactor16)
271271
require.ErrorIs(t, err, io.ErrUnexpectedEOF)
272272
}

0 commit comments

Comments
 (0)