Skip to content

Commit

Permalink
feat: auto-shard based on node size
Browse files Browse the repository at this point in the history
js counterpart to ipfs/kubo#8114

Changes the `shardSplitThreshold` parameter to mean the size of the
final DAGNode (including link names, sizes, etc) instead of the
number of entries in the directory.

Fixes: #149

BREAKING CHANGE: `shardSplitThreshold` now refers to node size, not number of entries
  • Loading branch information
achingbrain committed Sep 12, 2021
1 parent 0f9092e commit 3d235dc
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 13 deletions.
2 changes: 1 addition & 1 deletion packages/ipfs-unixfs-importer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ The input's file paths and directory structure will be preserved in the [`dag-pb
`options` is an JavaScript option that might include the following keys:

- `wrapWithDirectory` (boolean, defaults to false): if true, a wrapping node will be created
- `shardSplitThreshold` (positive integer, defaults to 1000): the number of directory entries above which we decide to use a sharding directory builder (instead of the default flat one)
- `shardSplitThreshold` (positive integer, defaults to 256KiB): if the serialized node is larger than this it will be converted to a HAMT sharded directory
- `chunker` (string, defaults to `"fixed"`): the chunking strategy. Supports:
- `fixed`
- `rabin`
Expand Down
40 changes: 39 additions & 1 deletion packages/ipfs-unixfs-importer/src/dir-flat.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { encode, prepare } from '@ipld/dag-pb'
import { UnixFS } from 'ipfs-unixfs'
import Dir from './dir.js'
import { Dir, CID_V0, CID_V1 } from './dir.js'
import persist from './utils/persist.js'

/**
Expand Down Expand Up @@ -32,6 +32,7 @@ class DirFlat extends Dir {
async put (name, value) {
this.cid = undefined
this.size = undefined
this.nodeSize = undefined

this._children[name] = value
}
Expand Down Expand Up @@ -68,6 +69,43 @@ class DirFlat extends Dir {
}
}

calculateNodeSize () {
if (this.nodeSize !== undefined) {
return this.nodeSize
}

const links = []

for (const name of Object.keys(this._children)) {
const child = this._children[name]
let size

if (child instanceof Dir) {
size = child.calculateNodeSize()
} else {
size = child.size
}

if (child.size != null && child.cid) {
links.push({
Name: name,
Tsize: size,
Hash: this.options.cidVersion === 0 ? CID_V0 : CID_V1
})
}
}

const unixfs = new UnixFS({
type: 'directory',
mtime: this.mtime,
mode: this.mode
})

this.nodeSize = encode(prepare({ Data: unixfs.marshal(), Links: links })).length

return this.nodeSize
}

/**
* @param {Blockstore} block
* @returns {AsyncIterable<ImportResult>}
Expand Down
91 changes: 89 additions & 2 deletions packages/ipfs-unixfs-importer/src/dir-sharded.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { encode, prepare } from '@ipld/dag-pb'
import { UnixFS } from 'ipfs-unixfs'
import Dir from './dir.js'
import { Dir, CID_V0, CID_V1 } from './dir.js'
import persist from './utils/persist.js'
import { createHAMT, Bucket } from 'hamt-sharding'

Expand Down Expand Up @@ -35,6 +35,10 @@ class DirSharded extends Dir {
* @param {InProgressImportResult | Dir} value
*/
async put (name, value) {
this.cid = undefined
this.size = undefined
this.nodeSize = undefined

await this._bucket.put(name, value)
}

Expand Down Expand Up @@ -66,6 +70,16 @@ class DirSharded extends Dir {
}
}

calculateNodeSize () {
if (this.nodeSize !== undefined) {
return this.nodeSize
}

this.nodeSize = calculateSize(this._bucket, this, this.options)

return this.nodeSize
}

/**
* @param {Blockstore} blockstore
* @returns {AsyncIterable<ImportResult>}
Expand All @@ -85,7 +99,7 @@ export default DirSharded
/**
* @param {Bucket<?>} bucket
* @param {Blockstore} blockstore
* @param {*} shardRoot
* @param {DirSharded | null} shardRoot
* @param {ImporterOptions} options
* @returns {AsyncIterable<ImportResult>}
*/
Expand Down Expand Up @@ -183,3 +197,76 @@ async function * flush (bucket, blockstore, shardRoot, options) {
size
}
}

/**
* @param {Bucket<?>} bucket
* @param {DirSharded | null} shardRoot
* @param {ImporterOptions} options
*/
function calculateSize (bucket, shardRoot, options) {
const children = bucket._children
const links = []

for (let i = 0; i < children.length; i++) {
const child = children.get(i)

if (!child) {
continue
}

const labelPrefix = i.toString(16).toUpperCase().padStart(2, '0')

if (child instanceof Bucket) {
const size = calculateSize(child, null, options)

links.push({
Name: labelPrefix,
Tsize: size,
Hash: options.cidVersion === 0 ? CID_V0 : CID_V1
})
} else if (typeof child.value.flush === 'function') {
const dir = child.value
const size = dir.nodeSize()

links.push({
Name: labelPrefix + child.key,
Tsize: size,
Hash: options.cidVersion === 0 ? CID_V0 : CID_V1
})
} else {
const value = child.value

if (!value.cid) {
continue
}

const label = labelPrefix + child.key
const size = value.size

links.push({
Name: label,
Tsize: size,
Hash: value.cid
})
}
}

// go-ipfs uses little endian, that's why we have to
// reverse the bit field before storing it
const data = Uint8Array.from(children.bitField().reverse())
const dir = new UnixFS({
type: 'hamt-sharded-directory',
data,
fanout: bucket.tableSize(),
hashType: options.hamtHashCode,
mtime: shardRoot && shardRoot.mtime,
mode: shardRoot && shardRoot.mode
})

const buffer = encode(prepare({
Data: dir.marshal(),
Links: links
}))

return buffer.length
}
21 changes: 18 additions & 3 deletions packages/ipfs-unixfs-importer/src/dir.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import { CID } from 'multiformats/cid'

/**
* @typedef {import('./types').ImporterOptions} ImporterOptions
* @typedef {import('./types').ImportResult} ImportResult
* @typedef {import('./types').InProgressImportResult} InProgressImportResult
* @typedef {import('interface-blockstore').Blockstore} Blockstore
* @typedef {import('multiformats/cid').CID} CID
*
* @typedef {object} DirProps
* @property {boolean} root
Expand All @@ -17,7 +18,7 @@
* @property {number} [mode]
* @property {import('ipfs-unixfs').Mtime} [mtime]
*/
class Dir {
export class Dir {
/**
* @param {DirProps} props
* @param {ImporterOptions} options
Expand All @@ -40,6 +41,8 @@ class Dir {
this.cid = undefined
/** @type {number | undefined} */
this.size = undefined
/** @type {number | undefined} */
this.nodeSize = undefined
}

/**
Expand All @@ -66,6 +69,18 @@ class Dir {
* @returns {AsyncIterable<ImportResult>}
*/
async * flush (blockstore) { }

/**
* @returns {number}
*/
calculateNodeSize () {
return 0
}
}

export default Dir
// we use these to calculate the node size to use as a check for whether a directory
// should be sharded or not. Since CIDs have a constant length and We're only
// interested in the data length and not the actual content identifier we can use
// any old CID instead of having to hash the data which is expensive.
export const CID_V0 = CID.parse('QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn')
export const CID_V1 = CID.parse('zdj7WbTaiJT1fgatdet9Ei9iDB5hdCxkbVyhyh8YTUnXMiwYi')
4 changes: 2 additions & 2 deletions packages/ipfs-unixfs-importer/src/flat-to-shard.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import DirSharded from './dir-sharded.js'
import DirFlat from './dir-flat.js'

/**
* @typedef {import('./dir').default} Dir
* @typedef {import('./dir').Dir} Dir
* @typedef {import('./types').ImporterOptions} ImporterOptions
*/

Expand All @@ -16,7 +16,7 @@ import DirFlat from './dir-flat.js'
async function flatToShard (child, dir, threshold, options) {
let newDir = dir

if (dir instanceof DirFlat && dir.directChildrenCount() >= threshold) {
if (dir instanceof DirFlat && dir.calculateNodeSize() > threshold) {
newDir = await convertToShard(dir, options)
}

Expand Down
3 changes: 2 additions & 1 deletion packages/ipfs-unixfs-importer/src/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ const defaultOptions = {
leafType: 'file', // 'raw'
cidVersion: 0,
progress: () => () => {},
shardSplitThreshold: 1000,
// https://github.com/ipfs/go-ipfs/pull/8114/files#diff-eec963b47a6e1080d9d8023b4e438e6e3591b4154f7379a7e728401d2055374aR319
shardSplitThreshold: 262144,
fileImportConcurrency: 50,
blockWriteConcurrency: 10,
minChunkSize: 262144,
Expand Down
2 changes: 1 addition & 1 deletion packages/ipfs-unixfs-importer/src/tree-builder.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import DirFlat from './dir-flat.js'
import flatToShard from './flat-to-shard.js'
import Dir from './dir.js'
import { Dir } from './dir.js'
import toPathComponents from './utils/to-path-components.js'

/**
Expand Down
4 changes: 2 additions & 2 deletions packages/ipfs-unixfs/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const DEFAULT_FILE_MODE = parseInt('0644', 8)
const DEFAULT_DIRECTORY_MODE = parseInt('0755', 8)

/**
* @param {string | number | undefined} [mode]
* @param {string | number | null | undefined} [mode]
*/
export function parseMode (mode) {
if (mode == null) {
Expand Down Expand Up @@ -161,7 +161,7 @@ class UnixFS {
* @param {number} [options.hashType]
* @param {number} [options.fanout]
* @param {MtimeLike | null} [options.mtime]
* @param {number | string} [options.mode]
* @param {number | string | null} [options.mode]
*/
constructor (options = {
type: 'file'
Expand Down

0 comments on commit 3d235dc

Please sign in to comment.