Skip to content

Commit

Permalink
feat(store/v2): Add Pruning Tests & Fix SQLite & PebbleDB Pruning (#1…
Browse files Browse the repository at this point in the history
  • Loading branch information
alexanderbez authored Nov 17, 2023
1 parent c0ebebb commit c207163
Show file tree
Hide file tree
Showing 11 changed files with 351 additions and 55 deletions.
17 changes: 14 additions & 3 deletions store/errors.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package store

import (
"fmt"

"cosmossdk.io/errors"
)

Expand Down Expand Up @@ -32,7 +34,16 @@ var (
ErrClosed = errors.Register(StoreCodespace, 8, "closed")
ErrRecordNotFound = errors.Register(StoreCodespace, 9, "record not found")
ErrUnknownStoreKey = errors.Register(StoreCodespace, 10, "unknown store key")
ErrInvalidVersion = errors.Register(StoreCodespace, 11, "invalid version")
ErrKeyEmpty = errors.Register(StoreCodespace, 12, "key empty")
ErrStartAfterEnd = errors.Register(StoreCodespace, 13, "start key after end key")
ErrKeyEmpty = errors.Register(StoreCodespace, 11, "key empty")
ErrStartAfterEnd = errors.Register(StoreCodespace, 12, "start key after end key")
)

// ErrVersionPruned defines an error returned when a version queried is pruned
// or does not exist.
type ErrVersionPruned struct {
EarliestVersion uint64
}

func (e ErrVersionPruned) Error() string {
return fmt.Sprintf("requested version is pruned; earliest available version is: %d", e.EarliestVersion)
}
8 changes: 8 additions & 0 deletions store/pruning/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ func (s *PruningTestSuite) TearDownTest() {
}

func (s *PruningTestSuite) TestPruning() {
s.T().SkipNow()

s.manager.SetCommitmentOptions(Options{4, 2, true})
s.manager.SetStorageOptions(Options{3, 3, false})
s.manager.Start()
Expand All @@ -53,12 +55,16 @@ func (s *PruningTestSuite) TestPruning() {
// write 10 batches
for i := uint64(0); i < latestVersion; i++ {
version := i + 1

cs := store.NewChangeset()
cs.Add([]byte("key"), []byte(fmt.Sprintf("value%d", version)))

err := s.sc.WriteBatch(cs)
s.Require().NoError(err)

_, err = s.sc.Commit()
s.Require().NoError(err)

err = s.ss.ApplyChangeset(version, cs)
s.Require().NoError(err)
s.manager.Prune(version)
Expand All @@ -71,6 +77,7 @@ func (s *PruningTestSuite) TestPruning() {
val, err := s.ss.Get("", latestVersion-4, []byte("key"))
s.Require().NoError(err)
s.Require().Equal([]byte("value96"), val)

// check the store for the version 50
val, err = s.ss.Get("", 50, []byte("key"))
s.Require().NoError(err)
Expand All @@ -80,6 +87,7 @@ func (s *PruningTestSuite) TestPruning() {
proof, err := s.sc.GetProof(latestVersion-4, []byte("key"))
s.Require().NoError(err)
s.Require().NotNil(proof.GetExist())

// check the commitment for the version 95
proof, err = s.sc.GetProof(latestVersion-5, []byte("key"))
s.Require().Error(err)
Expand Down
175 changes: 160 additions & 15 deletions store/storage/pebbledb/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@ import (

const (
VersionSize = 8
// PruneCommitBatchSize defines the size, in number of key/value pairs, to prune
// in a single batch.
PruneCommitBatchSize = 50

StorePrefixTpl = "s/k:%s/" // s/k:<storeKey>
latestVersionKey = "s/_latest" // NB: latestVersionKey key must be lexically smaller than StorePrefixTpl
StorePrefixTpl = "s/k:%s/" // s/k:<storeKey>
latestVersionKey = "s/_latest" // NB: latestVersionKey key must be lexically smaller than StorePrefixTpl
pruneHeightKey = "s/_prune_height" // NB: pruneHeightKey key must be lexically smaller than StorePrefixTpl
tombstoneVal = "TOMBSTONE"
)

Expand All @@ -26,6 +30,10 @@ var _ store.VersionedDatabase = (*Database)(nil)
type Database struct {
storage *pebble.DB

// earliestVersion defines the earliest version set in the database, which is
// only updated when the database is pruned.
earliestVersion uint64

// Sync is whether to sync writes through the OS buffer cache and down onto
// the actual disk, if applicable. Setting Sync is required for durability of
// individual write operations but can result in slower writes.
Expand All @@ -49,19 +57,35 @@ func New(dataDir string) (*Database, error) {
return nil, fmt.Errorf("failed to open PebbleDB: %w", err)
}

pruneHeight, err := getPruneHeight(db)
if err != nil {
return nil, fmt.Errorf("failed to get prune height: %w", err)
}

return &Database{
storage: db,
sync: true,
storage: db,
earliestVersion: pruneHeight + 1,
sync: true,
}, nil
}

func NewWithDB(storage *pebble.DB, sync bool) *Database {
pruneHeight, err := getPruneHeight(storage)
if err != nil {
panic(fmt.Errorf("failed to get prune height: %w", err))
}

return &Database{
storage: storage,
sync: sync,
storage: storage,
earliestVersion: pruneHeight + 1,
sync: sync,
}
}

func (db *Database) SetSync(sync bool) {
db.sync = sync
}

func (db *Database) Close() error {
err := db.storage.Close()
db.storage = nil
Expand All @@ -71,6 +95,7 @@ func (db *Database) Close() error {
func (db *Database) SetLatestVersion(version uint64) error {
var ts [VersionSize]byte
binary.LittleEndian.PutUint64(ts[:], version)

return db.storage.Set([]byte(latestVersionKey), ts[:], &pebble.WriteOptions{Sync: db.sync})
}

Expand All @@ -92,6 +117,15 @@ func (db *Database) GetLatestVersion() (uint64, error) {
return binary.LittleEndian.Uint64(bz), closer.Close()
}

func (db *Database) setPruneHeight(pruneVersion uint64) error {
db.earliestVersion = pruneVersion + 1

var ts [VersionSize]byte
binary.LittleEndian.PutUint64(ts[:], pruneVersion)

return db.storage.Set([]byte(pruneHeightKey), ts[:], &pebble.WriteOptions{Sync: db.sync})
}

func (db *Database) Has(storeKey string, version uint64, key []byte) (bool, error) {
val, err := db.Get(storeKey, version, key)
if err != nil {
Expand All @@ -102,6 +136,10 @@ func (db *Database) Has(storeKey string, version uint64, key []byte) (bool, erro
}

func (db *Database) Get(storeKey string, targetVersion uint64, key []byte) ([]byte, error) {
if targetVersion < db.earliestVersion {
return nil, store.ErrVersionPruned{EarliestVersion: db.earliestVersion}
}

prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion)
if err != nil {
if errors.Is(err, store.ErrRecordNotFound) {
Expand All @@ -126,9 +164,6 @@ func (db *Database) Get(storeKey string, targetVersion uint64, key []byte) ([]by
if err != nil {
return nil, fmt.Errorf("failed to decode value tombstone: %w", err)
}
if tombstone > targetVersion {
return nil, fmt.Errorf("value tombstone too large: %d", tombstone)
}

// A tombstone of zero or a target version that is less than the tombstone
// version means the key is not deleted at the target version.
Expand Down Expand Up @@ -161,13 +196,84 @@ func (db *Database) ApplyChangeset(version uint64, cs *store.Changeset) error {
return b.Write()
}

// Prune for the PebbleDB SS backend is currently not supported. It seems the only
// reliable way to prune is to iterate over the desired domain and either manually
// tombstone or delete. Either way, the operation would be timely.
// Prune removes all versions of all keys that are <= the given version.
//
// Note, the implementation of this method is inefficient and can be potentially
// time consuming given the size of the database and when the last pruning occurred
// (if any). This is because the implementation iterates over all keys in the
// database in order to delete them.
//
// See: https://github.com/cockroachdb/cockroach/blob/33623e3ee420174a4fd3226d1284b03f0e3caaac/pkg/storage/mvcc.go#L3182
func (db *Database) Prune(version uint64) error {
panic("not implemented!")
itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: []byte("s/k:")})
if err != nil {
return err
}
defer itr.Close()

batch := db.storage.NewBatch()
defer batch.Close()

var (
batchCounter int
prevKey, prevKeyPrefixed, prevPrefixedVal []byte
prevKeyVersion uint64
)

for itr.First(); itr.Valid(); {
prefixedKey := slices.Clone(itr.Key())

keyBz, verBz, ok := SplitMVCCKey(prefixedKey)
if !ok {
return fmt.Errorf("invalid PebbleDB MVCC key: %s", prefixedKey)
}

keyVersion, err := decodeUint64Ascending(verBz)
if err != nil {
return fmt.Errorf("failed to decode key version: %w", err)
}

// seek to next key if we are at a version which is higher than prune height
if keyVersion > version {
itr.NextPrefix()
continue
}

// Delete a key if another entry for that key exists a larger version than
// the original but <= to the prune height. We also delete a key if it has
// been tombstoned and its version is <= to the prune height.
if prevKeyVersion <= version && (bytes.Equal(prevKey, keyBz) || valTombstoned(prevPrefixedVal)) {
if err := batch.Delete(prevKeyPrefixed, nil); err != nil {
return err
}

batchCounter++
if batchCounter >= PruneCommitBatchSize {
if err := batch.Commit(&pebble.WriteOptions{Sync: db.sync}); err != nil {
return err
}

batchCounter = 0
batch.Reset()
}
}

prevKey = keyBz
prevKeyVersion = keyVersion
prevKeyPrefixed = prefixedKey
prevPrefixedVal = slices.Clone(itr.Value())

itr.Next()
}

// commit any leftover delete ops in batch
if batchCounter > 0 {
if err := batch.Commit(&pebble.WriteOptions{Sync: db.sync}); err != nil {
return err
}
}

return db.setPruneHeight(version)
}

func (db *Database) Iterator(storeKey string, version uint64, start, end []byte) (store.Iterator, error) {
Expand All @@ -191,7 +297,7 @@ func (db *Database) Iterator(storeKey string, version uint64, start, end []byte)
return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err)
}

return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, false), nil
return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.earliestVersion, false), nil
}

func (db *Database) ReverseIterator(storeKey string, version uint64, start, end []byte) (store.Iterator, error) {
Expand All @@ -215,7 +321,7 @@ func (db *Database) ReverseIterator(storeKey string, version uint64, start, end
return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err)
}

return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, true), nil
return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.earliestVersion, true), nil
}

func storePrefix(storeKey string) []byte {
Expand All @@ -226,6 +332,45 @@ func prependStoreKey(storeKey string, key []byte) []byte {
return append(storePrefix(storeKey), key...)
}

func getPruneHeight(storage *pebble.DB) (uint64, error) {
bz, closer, err := storage.Get([]byte(pruneHeightKey))
if err != nil {
if errors.Is(err, pebble.ErrNotFound) {
// in cases where pruning was never triggered
return 0, nil
}

return 0, err
}

if len(bz) == 0 {
return 0, closer.Close()
}

return binary.LittleEndian.Uint64(bz), closer.Close()
}

func valTombstoned(value []byte) bool {
if value == nil {
return false
}

_, tombBz, ok := SplitMVCCKey(value)
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC value.
panic(fmt.Sprintf("invalid PebbleDB MVCC value: %s", value))
}

// If the tombstone suffix is empty, we consider this a zero value and thus it
// is not tombstoned.
if len(tombBz) == 0 {
return false
}

return true
}

func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version uint64) ([]byte, error) {
// end domain is exclusive, so we need to increment the version by 1
if version < math.MaxUint64 {
Expand Down
12 changes: 8 additions & 4 deletions store/storage/pebbledb/db_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ import (
func TestStorageTestSuite(t *testing.T) {
s := &storage.StorageTestSuite{
NewDB: func(dir string) (store.VersionedDatabase, error) {
return New(dir)
db, err := New(dir)
if err == nil && db != nil {
// We set sync=false just to speed up CI tests. Operators should take
// careful consideration when setting this value in production environments.
db.SetSync(false)
}

return db, err
},
EmptyBatchSize: 12,
SkipTests: []string{
"TestStorageTestSuite/TestDatabase_Prune",
},
}

suite.Run(t, s)
Expand Down
14 changes: 13 additions & 1 deletion store/storage/pebbledb/iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,19 @@ type iterator struct {
reverse bool
}

func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version uint64, reverse bool) *iterator {
func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version, earliestVersion uint64, reverse bool) *iterator {
if version < earliestVersion {
return &iterator{
source: src,
prefix: prefix,
start: mvccStart,
end: mvccEnd,
version: version,
valid: false,
reverse: reverse,
}
}

// move the underlying PebbleDB iterator to the first key
var valid bool
if reverse {
Expand Down
Loading

0 comments on commit c207163

Please sign in to comment.