Skip to content
This repository was archived by the owner on Jun 27, 2023. It is now read-only.

Commit 51ba1e2

Browse files
committed
feat: switch to HAMT based on size
1 parent 28e86c5 commit 51ba1e2

File tree

3 files changed

+196
-42
lines changed

3 files changed

+196
-42
lines changed

io/directory.go

Lines changed: 94 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,13 @@ import (
1414
ipld "github.com/ipfs/go-ipld-format"
1515
)
1616

17-
// UseHAMTSharding is a global flag that signifies whether or not to use the
18-
// HAMT sharding scheme for directory creation
19-
var UseHAMTSharding = false
17+
// HAMTShardingSize is a global option that allows switching to a HAMTDirectory
18+
// when the BasicDirectory grows above the size (in bytes) signalled by this
19+
// flag. The default size of 0 disables the option.
20+
// The size is not the *exact* block size of the encoded BasicDirectory but just
21+
// the estimated size based on the byte length of each link's name and CID (BasicDirectory's
22+
// ProtoNode doesn't use the Data field so this estimate is pretty accurate).
23+
var HAMTShardingSize = 0
2024

2125
// DefaultShardWidth is the default value used for hamt sharding width.
2226
var DefaultShardWidth = 256
@@ -72,6 +76,12 @@ type Directory interface {
7276
type BasicDirectory struct {
7377
node *mdag.ProtoNode
7478
dserv ipld.DAGService
79+
80+
// Internal variable used to cache the estimated size used for the
81+
// HAMTShardingSize option. We maintain this value even if the
82+
// HAMTShardingSize is off since potentially the option could be activated
83+
// on the fly.
84+
estimatedSize int
7585
}
7686

7787
// HAMTDirectory is the HAMT implementation of `Directory`.
@@ -81,26 +91,29 @@ type HAMTDirectory struct {
8191
dserv ipld.DAGService
8292
}
8393

94+
func NewEmptyBasicDirectory(dserv ipld.DAGService) *BasicDirectory {
95+
return NewBasicDirectoryFromNode(dserv, format.EmptyDirNode())
96+
}
97+
98+
func NewBasicDirectoryFromNode(dserv ipld.DAGService, node *mdag.ProtoNode) *BasicDirectory {
99+
basicDir := new(BasicDirectory)
100+
basicDir.node = node
101+
basicDir.dserv = dserv
102+
103+
// Scan node links (if any) to restore estimated size.
104+
basicDir.ForEachLink(nil, func(l *ipld.Link) error {
105+
basicDir.addToEstimatedSize(l.Name, l.Cid)
106+
return nil
107+
})
108+
return basicDir
109+
}
110+
84111
// NewDirectory returns an UpgradeableDirectory containing an empty
85112
// BasicDirectory that can be converted to a HAMTDirectory once its
86113
// estimated size reaches the HAMTShardingSize threshold (if that option
87114
// is set).
88115
func NewDirectory(dserv ipld.DAGService) Directory {
89-
if UseHAMTSharding {
90-
dir := new(HAMTDirectory)
91-
s, err := hamt.NewShard(dserv, DefaultShardWidth)
92-
if err != nil {
93-
panic(err) // will only panic if DefaultShardWidth is a bad value
94-
}
95-
dir.shard = s
96-
dir.dserv = dserv
97-
return dir
98-
}
99-
100-
basicDir := new(BasicDirectory)
101-
basicDir.node = format.EmptyDirNode()
102-
basicDir.dserv = dserv
103-
return &UpgradeableDirectory{basicDir}
116+
return &UpgradeableDirectory{NewEmptyBasicDirectory(dserv)}
104117
}
105118

106119
// ErrNotADir implies that the given node was not a unixfs directory
@@ -121,10 +134,7 @@ func NewDirectoryFromNode(dserv ipld.DAGService, node ipld.Node) (Directory, err
121134

122135
switch fsNode.Type() {
123136
case format.TDirectory:
124-
return &BasicDirectory{
125-
dserv: dserv,
126-
node: protoBufNode.Copy().(*mdag.ProtoNode),
127-
}, nil
137+
return NewBasicDirectoryFromNode(dserv, protoBufNode.Copy().(*mdag.ProtoNode)), nil
128138
case format.THAMTShard:
129139
shard, err := hamt.NewHamtFromDag(dserv, node)
130140
if err != nil {
@@ -139,6 +149,19 @@ func NewDirectoryFromNode(dserv ipld.DAGService, node ipld.Node) (Directory, err
139149
return nil, ErrNotADir
140150
}
141151

152+
func (d *BasicDirectory) addToEstimatedSize(name string, linkCid cid.Cid) {
153+
d.estimatedSize += len(name) + len(linkCid.Bytes())
154+
// FIXME: Ideally we may want to track the Link size as well but it is
155+
// minor in comparison with the other two.
156+
}
157+
158+
func (d *BasicDirectory) removeFromEstimatedSize(name string, linkCid cid.Cid) {
159+
d.estimatedSize -= len(name) + len(linkCid.Bytes())
160+
if d.estimatedSize < 0 {
161+
panic("BasicDirectory's estimatedSize went below 0")
162+
}
163+
}
164+
142165
// SetCidBuilder implements the `Directory` interface.
143166
func (d *BasicDirectory) SetCidBuilder(builder cid.Builder) {
144167
d.node.SetCidBuilder(builder)
@@ -147,10 +170,15 @@ func (d *BasicDirectory) SetCidBuilder(builder cid.Builder) {
147170
// AddChild implements the `Directory` interface. It adds (or replaces)
148171
// a link to the given `node` under `name`.
149172
func (d *BasicDirectory) AddChild(ctx context.Context, name string, node ipld.Node) error {
150-
d.node.RemoveNodeLink(name)
151173
// Remove old link (if it existed), don't check a potential `ErrNotFound`.
174+
d.RemoveChild(ctx, name)
152175

153-
return d.node.AddNodeLink(name, node)
176+
err := d.node.AddNodeLink(name, node)
177+
if err != nil {
178+
return err
179+
}
180+
d.addToEstimatedSize(name, node.Cid())
181+
return nil
154182
}
155183

156184
// EnumLinksAsync returns a channel which will receive Links in the directory
@@ -203,11 +231,26 @@ func (d *BasicDirectory) Find(ctx context.Context, name string) (ipld.Node, erro
203231

204232
// RemoveChild implements the `Directory` interface.
205233
func (d *BasicDirectory) RemoveChild(ctx context.Context, name string) error {
206-
err := d.node.RemoveNodeLink(name)
234+
// We need to *retrieve* the link before removing it to update the estimated
235+
// size.
236+
// FIXME: If this is too much of a potential penalty we could leave a fixed
237+
// CID size estimation based on the most common one used (normally SHA-256).
238+
// Alternatively we could add a GetAndRemoveLink method in `merkledag` to
239+
// iterate node links slice only once.
240+
link, err := d.node.GetNodeLink(name)
207241
if err == mdag.ErrLinkNotFound {
208-
err = os.ErrNotExist
242+
return os.ErrNotExist
209243
}
210-
return err
244+
if err != nil {
245+
return err // at the moment there is no other error besides ErrLinkNotFound
246+
}
247+
248+
// The name actually existed so we should update the estimated size.
249+
d.removeFromEstimatedSize(link.Name, link.Cid)
250+
251+
return d.node.RemoveNodeLink(name)
252+
// GetNodeLink didn't return ErrLinkNotFound so this won't fail with that
253+
// and we don't need to convert the error again.
211254
}
212255

213256
// GetNode implements the `Directory` interface.
@@ -309,15 +352,31 @@ var _ Directory = (*UpgradeableDirectory)(nil)
309352
// AddChild implements the `Directory` interface. We check when adding new entries
310353
// if we should switch to HAMTDirectory according to global option(s).
311354
func (d *UpgradeableDirectory) AddChild(ctx context.Context, name string, nd ipld.Node) error {
312-
if UseHAMTSharding {
313-
if basicDir, ok := d.Directory.(*BasicDirectory); ok {
314-
hamtDir, err := basicDir.SwitchToSharding(ctx)
315-
if err != nil {
316-
return err
317-
}
318-
d.Directory = hamtDir
355+
err := d.Directory.AddChild(ctx, name, nd)
356+
if err != nil {
357+
return err
358+
}
359+
360+
// Evaluate possible HAMT upgrade.
361+
if HAMTShardingSize == 0 {
362+
return nil
363+
}
364+
basicDir, ok := d.Directory.(*BasicDirectory)
365+
if !ok {
366+
return nil
367+
}
368+
if basicDir.estimatedSize >= HAMTShardingSize {
369+
// FIXME: Ideally, to minimize the performance impact, we should check if this last
370+
// `AddChild` call would bring the directory size over the threshold
371+
// *before* executing it since we would end up switching anyway and
372+
// that call would be "wasted". This is a minimal performance impact
373+
// and we prioritize a simple code base.
374+
hamtDir, err := basicDir.SwitchToSharding(ctx)
375+
if err != nil {
376+
return err
319377
}
378+
d.Directory = hamtDir
320379
}
321380

322-
return d.Directory.AddChild(ctx, name, nd)
381+
return nil
323382
}

io/directory_test.go

Lines changed: 97 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package io
33
import (
44
"context"
55
"fmt"
6+
mdag "github.com/ipfs/go-merkledag"
7+
"math"
68
"testing"
79

810
ipld "github.com/ipfs/go-ipld-format"
@@ -98,27 +100,115 @@ func TestDuplicateAddDir(t *testing.T) {
98100
}
99101
}
100102

103+
// FIXME: Nothing blocking but nice to have:
104+
// * Check estimated size against link enumeration (indirectly done in the
105+
// restored node check from NewDirectoryFromNode).
106+
// * Check estimated size against encoded node (the difference should only be
107+
// a small percentage for a directory with 10s of entries).
108+
func TestBasicDirectory_estimatedSize(t *testing.T) {
109+
ds := mdtest.Mock()
110+
ctx := context.Background()
111+
child := ft.EmptyFileNode()
112+
err := ds.Add(ctx, child)
113+
if err != nil {
114+
t.Fatal(err)
115+
}
116+
117+
basicDir := NewEmptyBasicDirectory(ds)
118+
119+
// Several overwrites should not corrupt the size estimation.
120+
basicDir.AddChild(ctx, "child", child)
121+
basicDir.AddChild(ctx, "child", child)
122+
basicDir.AddChild(ctx, "child", child)
123+
basicDir.RemoveChild(ctx, "child")
124+
basicDir.AddChild(ctx, "child", child)
125+
basicDir.RemoveChild(ctx, "child")
126+
// FIXME: Check errors above (abstract adds/removals in iteration).
127+
if basicDir.estimatedSize != 0 {
128+
t.Fatal("estimated size is not zero after removing all entries")
129+
}
130+
131+
for i := 0; i < 100; i++ {
132+
basicDir.AddChild(ctx, fmt.Sprintf("child-%03d", i), child) // e.g., "child-045"
133+
}
134+
// Estimated entry size: name (9) + CID (32 from hash and 2 extra for header)
135+
entrySize := 9 + 32 + 2
136+
expectedSize := 100 * entrySize
137+
if basicDir.estimatedSize != expectedSize {
138+
t.Fatalf("estimated size (%d) inaccurate after adding many entries (expected %d)",
139+
basicDir.estimatedSize, expectedSize)
140+
}
141+
142+
basicDir.RemoveChild(ctx, "child-045") // just random values
143+
basicDir.RemoveChild(ctx, "child-063")
144+
basicDir.RemoveChild(ctx, "child-011")
145+
basicDir.RemoveChild(ctx, "child-000")
146+
basicDir.RemoveChild(ctx, "child-099")
147+
148+
basicDir.RemoveChild(ctx, "child-045") // already removed, won't impact size
149+
basicDir.RemoveChild(ctx, "nonexistent-name") // also doesn't count
150+
basicDir.RemoveChild(ctx, "child-100") // same
151+
expectedSize -= 5 * entrySize
152+
if basicDir.estimatedSize != expectedSize {
153+
t.Fatalf("estimated size (%d) inaccurate after removing some entries (expected %d)",
154+
basicDir.estimatedSize, expectedSize)
155+
}
156+
157+
// Restore a directory from original's node and check estimated size consistency.
158+
basicDirSingleNode, _ := basicDir.GetNode() // no possible error
159+
restoredBasicDir := NewBasicDirectoryFromNode(ds, basicDirSingleNode.(*mdag.ProtoNode))
160+
if basicDir.estimatedSize != restoredBasicDir.estimatedSize {
161+
t.Fatalf("restored basic directory size (%d) doesn't match original estimate (%d)",
162+
basicDir.estimatedSize, restoredBasicDir.estimatedSize)
163+
}
164+
}
165+
166+
// Basic test on extreme threshold to trigger switch. More fine-grained sizes
167+
// are checked in TestBasicDirectory_estimatedSize (without the switch itself
168+
// but focusing on the size computation).
169+
// FIXME: Ideally, instead of checking size computation on one test and directory
170+
// upgrade on another a better structured test should test both dimensions
171+
// simultaneously.
101172
func TestUpgradeableDirectory(t *testing.T) {
102-
oldHamtOption := UseHAMTSharding
103-
defer func() { UseHAMTSharding = oldHamtOption }()
173+
oldHamtOption := HAMTShardingSize
174+
defer func() { HAMTShardingSize = oldHamtOption }()
104175

105176
ds := mdtest.Mock()
106-
UseHAMTSharding = false // Create a BasicDirectory.
107177
dir := NewDirectory(ds)
178+
ctx := context.Background()
179+
child := ft.EmptyDirNode()
180+
err := ds.Add(ctx, child)
181+
if err != nil {
182+
t.Fatal(err)
183+
}
184+
185+
HAMTShardingSize = 0 // Create a BasicDirectory.
108186
if _, ok := dir.(*UpgradeableDirectory).Directory.(*BasicDirectory); !ok {
109187
t.Fatal("UpgradeableDirectory doesn't contain BasicDirectory")
110188
}
111189

112-
// Any new directory entry will trigger the upgrade to HAMTDirectory
113-
UseHAMTSharding = true
190+
// Set a threshold so big a new entry won't trigger the change.
191+
HAMTShardingSize = math.MaxInt32
192+
193+
err = dir.AddChild(ctx, "test", child)
194+
if err != nil {
195+
t.Fatal(err)
196+
}
197+
198+
if _, ok := dir.(*UpgradeableDirectory).Directory.(*HAMTDirectory); ok {
199+
t.Fatal("UpgradeableDirectory was upgraded to HAMTDirectory for a large threshold")
200+
}
201+
202+
// Now set it so low to make sure any new entry will trigger the upgrade.
203+
HAMTShardingSize = 1
114204

115-
err := dir.AddChild(context.Background(), "test", ft.EmptyDirNode())
205+
err = dir.AddChild(ctx, "test", child) // overwriting an entry should also trigger the switch
116206
if err != nil {
117207
t.Fatal(err)
118208
}
119209

120210
if _, ok := dir.(*UpgradeableDirectory).Directory.(*HAMTDirectory); !ok {
121-
t.Fatal("UpgradeableDirectory wasn't upgraded to HAMTDirectory")
211+
t.Fatal("UpgradeableDirectory wasn't upgraded to HAMTDirectory for a low threshold")
122212
}
123213
}
124214

unixfs.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,11 @@ func EmptyDirNode() *dag.ProtoNode {
361361
return dag.NodeWithData(FolderPBData())
362362
}
363363

364+
// EmptyFileNode creates an empty file Protonode.
365+
func EmptyFileNode() *dag.ProtoNode {
366+
return dag.NodeWithData(FilePBData(nil, 0))
367+
}
368+
364369
// ReadUnixFSNodeData extracts the UnixFS data from an IPLD node.
365370
// Raw nodes are (also) processed because they are used as leaf
366371
// nodes containing (only) UnixFS data.

0 commit comments

Comments
 (0)