Skip to content
This repository has been archived by the owner on Jun 27, 2023. It is now read-only.

feat: switch to HAMT based on size #91

Merged
merged 9 commits into from
May 7, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat: switch to HAMT based on size
  • Loading branch information
schomatis committed May 6, 2021
commit 51ba1e2de961619a5f9eafbba62bb867549215d0
129 changes: 94 additions & 35 deletions io/directory.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@ import (
ipld "github.com/ipfs/go-ipld-format"
)

// UseHAMTSharding is a global flag that signifies whether or not to use the
// HAMT sharding scheme for directory creation
var UseHAMTSharding = false
// UseHAMTSharding is a global option that allows switching to a HAMTDirectory
// when the BasicDirectory grows above the size (in bytes) signalled by this
// flag. The default size of 0 disables the option.
// The size is not the *exact* block size of the encoded BasicDirectory but just
// the estimated size based byte length of links name and CID (BasicDirectory's
// ProtoNode doesn't use the Data field so this estimate is pretty accurate).
var HAMTShardingSize = 0

// DefaultShardWidth is the default value used for hamt sharding width.
var DefaultShardWidth = 256
Expand Down Expand Up @@ -72,6 +76,12 @@ type Directory interface {
type BasicDirectory struct {
node *mdag.ProtoNode
dserv ipld.DAGService

// Internal variable used to cache the estimated size used for the
// HAMTShardingSize option. We maintain this value even if the
// HAMTShardingSize is off since potentially the option could be activated
// on the fly.
estimatedSize int
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's:

  1. Document the exact algorithm.
  2. Maybe call this "linkdataSize", or something equally inscrutable.

This "estimate" needs to accurately follow the algorithm to achieve convergence. While this is more of a "SHOULD" from a spec standpoint, I'd like to be as accurate as possible.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The full documentation is in the exported HAMTShardingSize (referenced here) which is what the user has access to; this is just an internal cache. I'd like to retain this name (or something similar to this effect) to better represent what we want to achieve (estimation of directory size) rather than the how (traversing links).

Also not following the talk about a spec (even if figuratively) and a reference to an algorithm which I'm not sure what part of the code it is designating (is it the addToEstimatedSize/removeFromEstimatedSize functions?).

All this said, if this is a blocking change feel free to push any documentation you want or rename however you prefer. Not very opinionated on this and it won't change how this PR works anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I just want something to indicate that this isn't just some random estimate that someone can just change, but that's not critical or merge blocking.

}

// HAMTDirectory is the HAMT implementation of `Directory`.
Expand All @@ -81,26 +91,29 @@ type HAMTDirectory struct {
dserv ipld.DAGService
}

func NewEmptyBasicDirectory(dserv ipld.DAGService) *BasicDirectory {
schomatis marked this conversation as resolved.
Show resolved Hide resolved
return NewBasicDirectoryFromNode(dserv, format.EmptyDirNode())
}

func NewBasicDirectoryFromNode(dserv ipld.DAGService, node *mdag.ProtoNode) *BasicDirectory {
schomatis marked this conversation as resolved.
Show resolved Hide resolved
basicDir := new(BasicDirectory)
basicDir.node = node
basicDir.dserv = dserv

// Scan node links (if any) to restore estimated size.
basicDir.ForEachLink(nil, func(l *ipld.Link) error {
basicDir.addToEstimatedSize(l.Name, l.Cid)
return nil
})
return basicDir
}

// NewDirectory returns a Directory that can either be a HAMTDirectory if the
// UseHAMTSharding is set, or otherwise an UpgradeableDirectory containing a
// BasicDirectory that can be converted to a HAMTDirectory if the option is
// set in the future.
func NewDirectory(dserv ipld.DAGService) Directory {
if UseHAMTSharding {
dir := new(HAMTDirectory)
s, err := hamt.NewShard(dserv, DefaultShardWidth)
if err != nil {
panic(err) // will only panic if DefaultShardWidth is a bad value
}
dir.shard = s
dir.dserv = dserv
return dir
}

basicDir := new(BasicDirectory)
basicDir.node = format.EmptyDirNode()
basicDir.dserv = dserv
return &UpgradeableDirectory{basicDir}
return &UpgradeableDirectory{NewEmptyBasicDirectory(dserv)}
}

// ErrNotADir implies that the given node was not a unixfs directory
Expand All @@ -121,10 +134,7 @@ func NewDirectoryFromNode(dserv ipld.DAGService, node ipld.Node) (Directory, err

switch fsNode.Type() {
case format.TDirectory:
return &BasicDirectory{
dserv: dserv,
node: protoBufNode.Copy().(*mdag.ProtoNode),
}, nil
return NewBasicDirectoryFromNode(dserv, protoBufNode.Copy().(*mdag.ProtoNode)), nil
case format.THAMTShard:
shard, err := hamt.NewHamtFromDag(dserv, node)
if err != nil {
Expand All @@ -139,6 +149,19 @@ func NewDirectoryFromNode(dserv ipld.DAGService, node ipld.Node) (Directory, err
return nil, ErrNotADir
}

func (d *BasicDirectory) addToEstimatedSize(name string, linkCid cid.Cid) {
d.estimatedSize += len(name) + len(linkCid.Bytes())
schomatis marked this conversation as resolved.
Show resolved Hide resolved
// FIXME: Ideally we may want to track the Link size as well but it is
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove this FIXME, we don't want someone to actually come along and fix this in code (it's a spec change, effectively.

// minor in comparison with the other two.
}

func (d *BasicDirectory) removeFromEstimatedSize(name string, linkCid cid.Cid) {
d.estimatedSize -= len(name) + len(linkCid.Bytes())
schomatis marked this conversation as resolved.
Show resolved Hide resolved
if d.estimatedSize < 0 {
panic("BasicDirectory's estimatedSize went below 0")
schomatis marked this conversation as resolved.
Show resolved Hide resolved
}
}

// SetCidBuilder implements the `Directory` interface.
func (d *BasicDirectory) SetCidBuilder(builder cid.Builder) {
d.node.SetCidBuilder(builder)
Expand All @@ -147,10 +170,15 @@ func (d *BasicDirectory) SetCidBuilder(builder cid.Builder) {
// AddChild implements the `Directory` interface. It adds (or replaces)
// a link to the given `node` under `name`.
func (d *BasicDirectory) AddChild(ctx context.Context, name string, node ipld.Node) error {
d.node.RemoveNodeLink(name)
// Remove old link (if it existed), don't check a potential `ErrNotFound`.
d.RemoveChild(ctx, name)
schomatis marked this conversation as resolved.
Show resolved Hide resolved

return d.node.AddNodeLink(name, node)
err := d.node.AddNodeLink(name, node)
if err != nil {
return err
}
d.addToEstimatedSize(name, node.Cid())
return nil
}

// EnumLinksAsync returns a channel which will receive Links in the directory
Expand Down Expand Up @@ -203,11 +231,26 @@ func (d *BasicDirectory) Find(ctx context.Context, name string) (ipld.Node, erro

// RemoveChild implements the `Directory` interface.
func (d *BasicDirectory) RemoveChild(ctx context.Context, name string) error {
err := d.node.RemoveNodeLink(name)
// We need to *retrieve* the link before removing it to update the estimated
// size.
// FIXME: If this is too much of a potential penalty we could leave a fixed
// CID size estimation based on the most common one used (normally SHA-256).
// Alternatively we could add a GetAndRemoveLink method in `merkledag` to
// iterate node links slice only once.
link, err := d.node.GetNodeLink(name)
schomatis marked this conversation as resolved.
Show resolved Hide resolved
if err == mdag.ErrLinkNotFound {
err = os.ErrNotExist
return os.ErrNotExist
}
return err
if err != nil {
return err // at the moment there is no other error besides ErrLinkNotFound
}

// The name actually existed so we should update the estimated size.
d.removeFromEstimatedSize(link.Name, link.Cid)

return d.node.RemoveNodeLink(name)
// GetNodeLink didn't return ErrLinkNotFound so this won't fail with that
// and we don't need to convert the error again.
}

// GetNode implements the `Directory` interface.
Expand Down Expand Up @@ -309,15 +352,31 @@ var _ Directory = (*UpgradeableDirectory)(nil)
// AddChild implements the `Directory` interface. We check when adding new entries
// if we should switch to HAMTDirectory according to global option(s).
func (d *UpgradeableDirectory) AddChild(ctx context.Context, name string, nd ipld.Node) error {
if UseHAMTSharding {
if basicDir, ok := d.Directory.(*BasicDirectory); ok {
hamtDir, err := basicDir.SwitchToSharding(ctx)
if err != nil {
return err
}
d.Directory = hamtDir
err := d.Directory.AddChild(ctx, name, nd)
if err != nil {
return err
}

// Evaluate possible HAMT upgrade.
if HAMTShardingSize == 0 {
return nil
}
basicDir, ok := d.Directory.(*BasicDirectory)
if !ok {
return nil
}
if basicDir.estimatedSize >= HAMTShardingSize {
// FIXME: Ideally to minimize performance we should check if this last
// `AddChild` call would bring the directory size over the threshold
// *before* executing it since we would end up switching anyway and
// that call would be "wasted". This is a minimal performance impact
// and we prioritize a simple code base.
schomatis marked this conversation as resolved.
Show resolved Hide resolved
hamtDir, err := basicDir.SwitchToSharding(ctx)
if err != nil {
return err
}
d.Directory = hamtDir
}

return d.Directory.AddChild(ctx, name, nd)
return nil
}
104 changes: 97 additions & 7 deletions io/directory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package io
import (
"context"
"fmt"
mdag "github.com/ipfs/go-merkledag"
schomatis marked this conversation as resolved.
Show resolved Hide resolved
"math"
"testing"

ipld "github.com/ipfs/go-ipld-format"
Expand Down Expand Up @@ -98,27 +100,115 @@ func TestDuplicateAddDir(t *testing.T) {
}
}

// FIXME: Nothing blocking but nice to have:
// * Check estimated size against link enumeration (indirectly done in the
// restored node check from NewDirectoryFromNode).
// * Check estimated size against encoded node (the difference should only be
// a small percentage for a directory with 10s of entries).
func TestBasicDirectory_estimatedSize(t *testing.T) {
ds := mdtest.Mock()
ctx := context.Background()
child := ft.EmptyFileNode()
err := ds.Add(ctx, child)
if err != nil {
t.Fatal(err)
}

basicDir := NewEmptyBasicDirectory(ds)

// Several overwrites should not corrupt the size estimation.
basicDir.AddChild(ctx, "child", child)
basicDir.AddChild(ctx, "child", child)
basicDir.AddChild(ctx, "child", child)
basicDir.RemoveChild(ctx, "child")
basicDir.AddChild(ctx, "child", child)
basicDir.RemoveChild(ctx, "child")
// FIXME: Check errors above (abstract adds/removals in iteration).
if basicDir.estimatedSize != 0 {
t.Fatal("estimated size is not zero after removing all entries")
}

for i := 0; i < 100; i++ {
basicDir.AddChild(ctx, fmt.Sprintf("child-%03d", i), child) // e.g., "child-045"
}
// Estimated entry size: name (9) + CID (32 from hash and 2 extra for header)
entrySize := 9 + 32 + 2
expectedSize := 100 * entrySize
if basicDir.estimatedSize != expectedSize {
t.Fatalf("estimated size (%d) inaccurate after adding many entries (expected %d)",
basicDir.estimatedSize, expectedSize)
}

basicDir.RemoveChild(ctx, "child-045") // just random values
basicDir.RemoveChild(ctx, "child-063")
basicDir.RemoveChild(ctx, "child-011")
basicDir.RemoveChild(ctx, "child-000")
basicDir.RemoveChild(ctx, "child-099")

basicDir.RemoveChild(ctx, "child-045") // already removed, won't impact size
basicDir.RemoveChild(ctx, "nonexistent-name") // also doesn't count
basicDir.RemoveChild(ctx, "child-100") // same
expectedSize -= 5 * entrySize
if basicDir.estimatedSize != expectedSize {
t.Fatalf("estimated size (%d) inaccurate after removing some entries (expected %d)",
basicDir.estimatedSize, expectedSize)
}

// Restore a directory from original's node and check estimated size consistency.
basicDirSingleNode, _ := basicDir.GetNode() // no possible error
restoredBasicDir := NewBasicDirectoryFromNode(ds, basicDirSingleNode.(*mdag.ProtoNode))
if basicDir.estimatedSize != restoredBasicDir.estimatedSize {
t.Fatalf("restored basic directory size (%d) doesn't match original estimate (%d)",
basicDir.estimatedSize, restoredBasicDir.estimatedSize)
}
}

// Basic test on extreme threshold to trigger switch. More fine-grained sizes
// are checked in TestBasicDirectory_estimatedSize (without the swtich itself
// but focusing on the size computation).
// FIXME: Ideally, instead of checking size computation on one test and directory
// upgrade on another a better structured test should test both dimensions
// simultaneously.
func TestUpgradeableDirectory(t *testing.T) {
oldHamtOption := UseHAMTSharding
defer func() { UseHAMTSharding = oldHamtOption }()
oldHamtOption := HAMTShardingSize
defer func() { HAMTShardingSize = oldHamtOption }()

ds := mdtest.Mock()
UseHAMTSharding = false // Create a BasicDirectory.
dir := NewDirectory(ds)
ctx := context.Background()
child := ft.EmptyDirNode()
err := ds.Add(ctx, child)
if err != nil {
t.Fatal(err)
}

HAMTShardingSize = 0 // Create a BasicDirectory.
if _, ok := dir.(*UpgradeableDirectory).Directory.(*BasicDirectory); !ok {
t.Fatal("UpgradeableDirectory doesn't contain BasicDirectory")
}

// Any new directory entry will trigger the upgrade to HAMTDirectory
UseHAMTSharding = true
// Set a threshold so big a new entry won't trigger the change.
HAMTShardingSize = math.MaxInt32

err = dir.AddChild(ctx, "test", child)
if err != nil {
t.Fatal(err)
}

if _, ok := dir.(*UpgradeableDirectory).Directory.(*HAMTDirectory); ok {
t.Fatal("UpgradeableDirectory was upgraded to HAMTDirectory for a large threshold")
}

// Now set it so low to make sure any new entry will trigger the upgrade.
HAMTShardingSize = 1

err := dir.AddChild(context.Background(), "test", ft.EmptyDirNode())
err = dir.AddChild(ctx, "test", child) // overwriting an entry should also trigger the switch
if err != nil {
t.Fatal(err)
}

if _, ok := dir.(*UpgradeableDirectory).Directory.(*HAMTDirectory); !ok {
t.Fatal("UpgradeableDirectory wasn't upgraded to HAMTDirectory")
t.Fatal("UpgradeableDirectory wasn't upgraded to HAMTDirectory for a low threshold")
}
}

Expand Down
5 changes: 5 additions & 0 deletions unixfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,11 @@ func EmptyDirNode() *dag.ProtoNode {
return dag.NodeWithData(FolderPBData())
}

// EmptyDirNode creates an empty folder Protonode.
func EmptyFileNode() *dag.ProtoNode {
schomatis marked this conversation as resolved.
Show resolved Hide resolved
return dag.NodeWithData(FilePBData(nil, 0))
}

// ReadUnixFSNodeData extracts the UnixFS data from an IPLD node.
// Raw nodes are (also) processed because they are used as leaf
// nodes containing (only) UnixFS data.
Expand Down