This repository was archived by the owner on Feb 12, 2024. It is now read-only.

Implement Garbage Collection #2022

Merged
56 commits, merged Aug 26, 2019
Changes from 1 commit
Commits
56 commits
eb4ab1f
feat: implement ipfs refs
dirkmc Apr 22, 2019
761e305
feat: refs support in http api
dirkmc Apr 24, 2019
2f632ec
feat: use ipld instead of unix-fs-exporter for refs
dirkmc Apr 28, 2019
7b498fb
feat: refs local
dirkmc Apr 30, 2019
c8e964a
feat: add refs.localPullStream && refs.localReadableStream
dirkmc May 1, 2019
0137caf
feat: make object.links work with CBOR
dirkmc May 3, 2019
f6d7a2a
feat: handle multiple refs. Better param handling
dirkmc May 3, 2019
793a355
feat: GC
dirkmc May 8, 2019
719a9f9
chore: add comment to explain cli param parsing
dirkmc May 8, 2019
a5db723
refactor: move links retrieval from object to refs
dirkmc May 9, 2019
ae27eb5
feat: expose GC to http api
dirkmc May 10, 2019
1e3aedc
test: unskip repo gc test
dirkmc May 10, 2019
df86ce4
fix: refactor and fix some bugs with GC
dirkmc May 10, 2019
0d5085d
feat: GC locking
dirkmc May 20, 2019
43b1720
test: add gc locking tests
dirkmc May 21, 2019
d970a32
refactor: rebase
dirkmc May 21, 2019
3ec57d9
fix: gc use uppercase dag.Links
dirkmc May 21, 2019
255dee3
chore: update package.json deps
dirkmc May 21, 2019
c8d1f08
chore: rebase
dirkmc May 21, 2019
568a1d9
chore: add joi to package.json
dirkmc May 22, 2019
c0007af
refactor: pin/gc common code
dirkmc May 22, 2019
8de0c2b
fix: browser gc tests
dirkmc May 22, 2019
28b615d
fix: gc parsing of block cid in browser
dirkmc May 22, 2019
05ae894
test: add gc-lock tests
dirkmc May 23, 2019
8b52444
fix: gc lock error handling
dirkmc May 23, 2019
0c7cfdf
fix: gc - take pin lock after resolve
dirkmc May 23, 2019
81e3dd0
fix: make sure each GCLock instance uses distinct mutex
dirkmc May 23, 2019
7898ca2
fix: choose non-overlapping port for GC test
dirkmc May 24, 2019
bdcbddb
fix: better gc test port config
dirkmc May 24, 2019
a07ed7f
test: increase timeout for repo gc test
dirkmc May 24, 2019
ef86efc
fix: webworkers + mortice
dirkmc May 28, 2019
75159a0
chore: refactor mortice options
dirkmc May 28, 2019
c2b5ef6
fix: gc rm test on Windows
dirkmc May 28, 2019
7cfc53b
fix: ensure gc filters all internal pins
dirkmc Jun 7, 2019
026158f
test: enable gc tests over ipfs-http-client
dirkmc Jun 7, 2019
bf4e731
chore: better gc logging
dirkmc Jun 10, 2019
356e263
fix: pin walking
dirkmc Jun 14, 2019
58f34d6
refactor: pin set walking
dirkmc Jun 21, 2019
168046a
refactor: import pull modules directly
dirkmc Jul 9, 2019
216e53a
chore: update mortice package
dirkmc Jul 9, 2019
4712178
refactor: use assert.fail() instead of throwing for programmer err
dirkmc Jul 9, 2019
681b577
chore: lint fixes
dirkmc Jul 9, 2019
c81a920
chore: update ipfs-http-client
dirkmc Jul 10, 2019
11a02dd
fix: path to gc-lock
dirkmc Jul 10, 2019
1e4c97a
fix: apply review comments
dirkmc Jul 15, 2019
a8c362c
chore: address review comments
dirkmc Jul 17, 2019
43a2644
refactor: simplify gc-lock
dirkmc Jul 18, 2019
c992c51
refactor: move EventEmitter from GCLock to test
dirkmc Jul 18, 2019
c364b19
refactor: better default args handing in Mutex
dirkmc Jul 18, 2019
358b957
fix: lint fixes
dirkmc Jul 19, 2019
bcc8a69
fix: use repo path as mortice id
dirkmc Jul 19, 2019
24c072e
test: add repo gc cli tests
dirkmc Jul 22, 2019
97f8054
fix: remove hacky cocde
dirkmc Jul 22, 2019
36f45f4
test: sharness test for GC
dirkmc Jul 23, 2019
d085a30
fix: review feedback
Aug 21, 2019
f869455
fix: Do not load all of a DAG into memory when pinning (#2387)
achingbrain Aug 23, 2019
refactor: move EventEmitter from GCLock to test
dirkmc authored and Alan Shaw committed Aug 26, 2019
commit c992c51e4db6d51e2a6d3355cda5a6b689aea2da
44 changes: 21 additions & 23 deletions src/core/components/pin/gc-lock.js
@@ -3,24 +3,19 @@
const pull = require('pull-stream/pull')
const pullThrough = require('pull-stream/throughs/through')
const pullAsyncMap = require('pull-stream/throughs/async-map')
const EventEmitter = require('events')
const Mutex = require('../../../utils/mutex')
const log = require('debug')('ipfs:gc:lock')

class GCLock extends EventEmitter {
class GCLock {
constructor (repoOwner) {
super()

this.mutex = new Mutex(repoOwner, { log })
}

readLock (lockedFn, cb) {
this.emit(`readLock request`)
return this.mutex.readLock(lockedFn, cb)
}

writeLock (lockedFn, cb) {
this.emit(`writeLock request`)
return this.mutex.writeLock(lockedFn, cb)
}

@@ -33,7 +28,7 @@ class GCLock extends EventEmitter {
}

pullLock (type, lockedPullFn) {
const pullLocker = new PullLocker(this, this.mutex, type, this.lockId++)
const pullLocker = new PullLocker(this.mutex, type)

return pull(
pullLocker.take(),
@@ -44,8 +39,7 @@
}

class PullLocker {
constructor (emitter, mutex, type) {
this.emitter = emitter
constructor (mutex, type) {
this.mutex = mutex
this.type = type

@@ -54,26 +48,30 @@ class PullLocker {
}

take () {
return pull(
pullAsyncMap((i, cb) => {
if (this.lockRequested) {
return cb(null, i)
}
this.lockRequested = true

this.emitter.emit(`${this.type} request`)

this.mutex[this.type]((releaseLock) => {
cb(null, i)
this.releaseLock = releaseLock
})
return pullAsyncMap((i, cb) => {
// Check if the lock has already been acquired.
// Note: new items will only come through the pull stream once the first
// item has acquired a lock.
if (this.releaseLock) {
// The lock has been acquired so return immediately
return cb(null, i)
}

// Request the lock
this.mutex[this.type]((releaseLock) => {
// The lock has been granted, so run the locked piece of code
cb(null, i)

// Save the release function to be called when the stream completes
this.releaseLock = releaseLock
})
)
})
}

// Releases the lock
release () {
return pullThrough(null, (err) => {
// When the stream completes, release the lock
this.releaseLock(err)
Member:
Doesn't this need to be protected? Like:

if (this.releaseLock) {
  this.releaseLock(err)
  this.releaseLock = null
} else {
  throw new Error('No lock to release')
}

Contributor Author:
Because release() is called after the stream completes, it shouldn't be possible to get here without a lock having been acquired (and this.releaseLock being set)

Member:
What happens if lockedPullFn() doesn't work properly, e.g. throws or breaks the flow?

Contributor Author:
lockedPullFn() returns a pull stream, so it depends on what that pull-stream does

Member:
What I'm asking is: if lockedPullFn() does something wrong, is this code still safe?

Contributor Author:
I don't believe that will affect anything in the PullLocker.release() function - the second callback to pullThrough() should only be invoked once the stream ends.

If an exception is thrown from within the pull-stream returned by lockedPullFn(), I believe it will be thrown outside the control of the locking class.

})
Member:
The pipeline becomes something like:

pull(
  ...input,
  take,
  process,
  release,
  ...output
)

The problem I see with this is that we may end up taking a lock for significantly longer than we may need to due to slow user pipeline actions.

For example, imagine an output action that takes 1s. After we have processed the last chunk of input, we're going to have to wait a further second before the lock is released, since the release happens when the entire pipeline ends, not immediately after we've processed the last chunk.

Can we eagerly request the next chunk here so we know when the source is drained before the sink requests the last read? This sounds like a module that should already exist!

I don't know how we'd mitigate this on the source side, maybe buffer up a certain amount before we start processing it (this won't help with big files). You could imagine a slow source being someone adding something over the HTTP API.

Contributor Author:
That's a great point, locking with pull streams is very tricky. Maybe it's better to take out the pull stream locking code, and instead have an explicit lock and release at the start and end of the locked portion of code?

Member:
That's kinda what you're doing, isn't it? ...but in a helper function that adds both take and release to the pipeline.

How would you see this working for addPullStream?

Contributor Author:
You're right there isn't really another way to do it. I think your suggestion makes the most sense. We'd need to think about how to implement that in a way that doesn't make this even more complex.

}
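For contrast with the pull-stream stages above, here is a minimal sketch of the explicit acquire/release approach mentioned in the thread; this is an illustration, not code from this PR. It relies only on the callback interface shown in this diff, where the locked function receives a release callback once the lock has been granted; storeBlocks is a hypothetical stand-in for the locked work.

function addWithLock (gcLock, blocks, callback) {
  gcLock.readLock((releaseLock) => {
    // Locked section: do the work, then release explicitly
    storeBlocks(blocks, (err) => {
      // Release as soon as the locked work completes, rather than when an
      // enclosing pull pipeline finally drains
      releaseLock(err)
    })
  }, callback)
}

As the thread concludes, this only sidesteps the problem when the locked work is not itself a lazily-pulled stream; for APIs like addPullStream the take/release stages are still needed.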
2 changes: 1 addition & 1 deletion src/utils/mutex.js
@@ -8,7 +8,7 @@ const noop = () => {}

// Wrap mortice to present a callback interface
class Mutex {
constructor (repoOwner, options) {
constructor (repoOwner, options = {}) {
// Ensure that we get a different mutex for each instance of the lock
const randId = nanoid()
this.mutex = mortice(randId, {
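The hunk above only shows the constructor change to mutex.js, so the following is a rough sketch of the callback-interface wrapper it describes, not the PR's implementation. The options passed to mortice are elided in the diff, and the assumption that mortice's readLock/writeLock accept a promise-returning function and hold the lock until that promise settles reflects the mortice API of that era, so check the version actually in use.

const mortice = require('mortice')
const nanoid = require('nanoid')

const noop = () => {}

// Wrap mortice to present a callback interface (sketch)
class Mutex {
  constructor (repoOwner, options = {}) {
    // Ensure that we get a different mutex for each instance of the lock.
    // repoOwner feeds into the mortice options in the real code (elided here).
    this.mutex = mortice(nanoid(), { /* mortice options elided in the diff above */ })
    this.log = options.log || noop
  }

  readLock (lockedFn, cb) {
    return this._lock('readLock', lockedFn, cb)
  }

  writeLock (lockedFn, cb) {
    return this._lock('writeLock', lockedFn, cb)
  }

  _lock (type, lockedFn, cb = noop) {
    this.log(`${type} requested`)
    // Assumed mortice behaviour: run the given function once the lock is held
    // and keep the lock until the returned promise settles
    this.mutex[type](() => new Promise((resolve) => {
      // The callback handed to lockedFn doubles as the release function,
      // matching how PullLocker treats its first argument above
      lockedFn((err, res) => {
        cb(err, res)
        resolve()
      })
    }))
  }
}

Note that GCLock.readLock/writeLock simply forward (lockedFn, cb) to this wrapper, while PullLocker calls this.mutex[type] with only the locked function and keeps its callback as releaseLock.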
46 changes: 37 additions & 9 deletions test/core/gc.spec.js
@@ -12,6 +12,28 @@ const pEvent = require('p-event')
const env = require('ipfs-utils/src/env')
const IPFS = require('../../src/core')

// We need to detect when a readLock or writeLock is requested for the tests
// so we override the Mutex class to emit an event
const EventEmitter = require('events')
const Mutex = require('../../src/utils/mutex')

class MutexEmitter extends Mutex {
constructor (repoOwner) {
super(repoOwner)
this.emitter = new EventEmitter()
}

readLock (lockedFn, cb) {
this.emitter.emit('readLock request')
return super.readLock(lockedFn, cb)
}

writeLock (lockedFn, cb) {
this.emitter.emit('writeLock request')
return super.writeLock(lockedFn, cb)
}
}

describe('gc', function () {
Member:
A couple of questions:

  • What is the largest repo size that gc has been tested with?
  • How do these tests compare to go-ipfs gc tests?

Member:
I'm asking this because I expect people to use GC when their repo gets to a significant size (10GB and up), not when it is small.

Contributor Author:
Good point 👍

I have been testing locally with repos a few hundred MB in size:

$ du -h ~/.jsipfs | tail -1
621M	/Users/dirk/.jsipfs

$ time js-ipfs repo gc
real	0m4.141s
user	0m0.858s
sys	0m0.174s

I haven't performed stress testing with repos of the size you're suggesting.

I had a look at go-ipfs; it seems like there are some sharness tests for GC, but I didn't see any stress tests.

Contributor Author:
I just tried with an 11GB repo and it ran in a few seconds. I will add a sharness test for stress-testing

Member:
An 11GB repo, and how large a GC? Half of it?

Contributor Author:
GC removes all blocks that are not pinned (same as go-ipfs). In this case I added 11GB of unpinned files and then GCed them.

Contributor Author:
I added a sharness test for GC, in which the number of files and their size can be easily adjusted. It seems like GC performance is linear:

[Screenshot and linked spreadsheet of the GC timing measurements omitted]
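The sharness test itself is shell-based and not part of this diff, but the same measurement can be sketched against the core API already used in the tests above (ipfs.add with the pin option, and ipfs.repo.gc). This is an illustrative harness rather than the PR's test; the file count and file size are arbitrary knobs.

const crypto = require('crypto')

// Rough GC timing sketch: add unpinned files so GC has something to collect,
// then time a GC run
async function timeGc (ipfs, fileCount = 100, fileSize = 1024 * 1024) {
  for (let i = 0; i < fileCount; i++) {
    await ipfs.add({
      path: `gc-stress/file-${i}`,
      content: crypto.randomBytes(fileSize)
    }, { pin: false })
  }

  const start = Date.now()
  await ipfs.repo.gc()
  console.log(`GC after adding ${fileCount} files of ${fileSize} bytes took ${Date.now() - start} ms`)
}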

const fixtures = [{
path: 'test/my/path1',
@@ -29,6 +51,7 @@ describe('gc', function () {

let ipfsd
let ipfs
let lockEmitter

before(function (done) {
this.timeout(40 * 1000)
@@ -48,6 +71,11 @@ describe('gc', function () {
ipfsd = node
ipfs = ipfsd.api

// Replace the Mutex with one that emits events when a readLock or
// writeLock is requested (needed in the tests below)
ipfs._gcLock.mutex = new MutexEmitter(ipfs._options.repoOwner)
lockEmitter = ipfs._gcLock.mutex.emitter

done()
})
})
@@ -79,13 +107,13 @@ describe('gc', function () {
it(`garbage collection should wait for pending ${test.name} to finish`, async () => {
// Add blocks to IPFS
// Note: add operation will take a read lock
const addLockRequested = pEvent(ipfs._gcLock, 'readLock request')
const addLockRequested = pEvent(lockEmitter, 'readLock request')
const add1 = test.add1()

// Once add lock has been requested, start GC
await addLockRequested
// Note: GC will take a write lock
const gcStarted = pEvent(ipfs._gcLock, 'writeLock request')
const gcStarted = pEvent(lockEmitter, 'writeLock request')
const gc = ipfs.repo.gc()

// Once GC has started, start second add
@@ -109,13 +137,13 @@ describe('gc', function () {
it('garbage collection should wait for pending add + pin to finish', async () => {
// Add blocks to IPFS
// Note: add operation will take a read lock
const addLockRequested = pEvent(ipfs._gcLock, 'readLock request')
const addLockRequested = pEvent(lockEmitter, 'readLock request')
const add1 = ipfs.add(fixtures[2], { pin: true })

// Once add lock has been requested, start GC
await addLockRequested
// Note: GC will take a write lock
const gcStarted = pEvent(ipfs._gcLock, 'writeLock request')
const gcStarted = pEvent(lockEmitter, 'writeLock request')
const gc = ipfs.repo.gc()

// Once GC has started, start second add
@@ -142,13 +170,13 @@

// Remove first block from IPFS
// Note: block rm will take a write lock
const rmLockRequested = pEvent(ipfs._gcLock, 'writeLock request')
const rmLockRequested = pEvent(lockEmitter, 'writeLock request')
const rm1 = ipfs.block.rm(cid1)

// Once rm lock has been requested, start GC
await rmLockRequested
// Note: GC will take a write lock
const gcStarted = pEvent(ipfs._gcLock, 'writeLock request')
const gcStarted = pEvent(lockEmitter, 'writeLock request')
const gc = ipfs.repo.gc()

// Once GC has started, start second rm
@@ -185,7 +213,7 @@

// Pin first block
// Note: pin add will take a read lock
const pinLockRequested = pEvent(ipfs._gcLock, 'readLock request')
const pinLockRequested = pEvent(lockEmitter, 'readLock request')
const pin1 = ipfs.pin.add(cid1)

// Once pin lock has been requested, start GC
@@ -222,13 +250,13 @@

// Unpin first block
// Note: pin rm will take a read lock
const pinLockRequested = pEvent(ipfs._gcLock, 'readLock request')
const pinLockRequested = pEvent(lockEmitter, 'readLock request')
const pinRm1 = ipfs.pin.rm(cid1)

// Once pin lock has been requested, start GC
await pinLockRequested
// Note: GC will take a write lock
const gcStarted = pEvent(ipfs._gcLock, 'writeLock request')
const gcStarted = pEvent(lockEmitter, 'writeLock request')
const gc = ipfs.repo.gc()

// Once GC has started, start second pin rm