Skip to content

Commit

Permalink
Introduce ReadOptions::pin_data (support zero copy for keys)
Browse files Browse the repository at this point in the history
Summary:
This patch updates the Iterator API, introducing new functions that allow users to keep the Slices returned by key() valid for as long as the Iterator is not deleted.

ReadOptions::pin_data : If true, keep loaded blocks in memory as long as the iterator is not deleted.
Iterator::IsKeyPinned() : If true, this means that the Slice returned by key() is valid as long as the iterator is not deleted.

Also adds a new option, BlockBasedTableOptions::use_delta_encoding, to allow users to disable delta encoding if needed.

Benchmark results (using https://phabricator.fb.com/P20083553)

```
// $ du -h /home/tec/local/normal.4K.Snappy/db10077
// 6.1G    /home/tec/local/normal.4K.Snappy/db10077

// $ du -h /home/tec/local/zero.8K.LZ4/db10077
// 6.4G    /home/tec/local/zero.8K.LZ4/db10077

// Benchmarks for shard db10077
// _build/opt/rocks/benchmark/rocks_copy_benchmark \
//      --normal_db_path="/home/tec/local/normal.4K.Snappy/db10077" \
//      --zero_db_path="/home/tec/local/zero.8K.LZ4/db10077"

// First run
// ============================================================================
// rocks/benchmark/RocksCopyBenchmark.cpp          relative  time/iter  iters/s
// ============================================================================
// BM_StringCopy                                                 1.73s  576.97m
// BM_StringPiece                                   103.74%      1.67s  598.55m
// ============================================================================
// Match rate : 1000000 / 1000000

// Second run
// ============================================================================
// rocks/benchmark/RocksCopyBenchmark.cpp          relative  time/iter  iters/s
// ============================================================================
// BM_StringCopy                                              611.99ms     1.63
// BM_StringPiece                                   203.76%   300.35ms     3.33
// ============================================================================
// Match rate : 1000000 / 1000000
```

Test Plan: Unit tests

Reviewers: sdong, igor, anthony, yhchiang, rven

Reviewed By: rven

Subscribers: dhruba, lovro, adsharma

Differential Revision: https://reviews.facebook.net/D48999
  • Loading branch information
IslamAbdelRahman committed Dec 16, 2015
1 parent e6e505a commit aececc2
Show file tree
Hide file tree
Showing 21 changed files with 666 additions and 67 deletions.
18 changes: 11 additions & 7 deletions db/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3828,7 +3828,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
env_, *cfd->ioptions(), cfd->user_comparator(), iter,
kMaxSequenceNumber,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
read_options.iterate_upper_bound, read_options.prefix_same_as_start);
read_options.iterate_upper_bound, read_options.prefix_same_as_start,
read_options.pin_data);
#endif
} else {
SequenceNumber latest_snapshot = versions_->LastSequence();
Expand Down Expand Up @@ -3885,7 +3886,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
read_options.iterate_upper_bound, read_options.prefix_same_as_start);
read_options.iterate_upper_bound, read_options.prefix_same_as_start,
read_options.pin_data);

InternalIterator* internal_iter =
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
Expand Down Expand Up @@ -3931,10 +3933,11 @@ Status DBImpl::NewIterators(
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
auto iter = new ForwardIterator(this, read_options, cfd, sv);
iterators->push_back(
NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter,
kMaxSequenceNumber,
sv->mutable_cf_options.max_sequential_skip_in_iterations));
iterators->push_back(NewDBIterator(
env_, *cfd->ioptions(), cfd->user_comparator(), iter,
kMaxSequenceNumber,
sv->mutable_cf_options.max_sequential_skip_in_iterations, nullptr,
false, read_options.pin_data));
}
#endif
} else {
Expand All @@ -3953,7 +3956,8 @@ Status DBImpl::NewIterators(

ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
sv->mutable_cf_options.max_sequential_skip_in_iterations);
sv->mutable_cf_options.max_sequential_skip_in_iterations, nullptr,
false, read_options.pin_data);
InternalIterator* internal_iter =
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
db_iter->SetIterUnderDBIter(internal_iter);
Expand Down
73 changes: 62 additions & 11 deletions db/db_iter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ class DBIter: public Iterator {
current_entry_is_merged_(false),
statistics_(ioptions.statistics),
iterate_upper_bound_(iterate_upper_bound),
prefix_same_as_start_(prefix_same_as_start) {
prefix_same_as_start_(prefix_same_as_start),
iter_pinned_(false) {
RecordTick(statistics_, NO_ITERATORS);
prefix_extractor_ = ioptions.prefix_extractor;
max_skip_ = max_sequential_skip_in_iterations;
Expand All @@ -92,6 +93,9 @@ class DBIter: public Iterator {
// Attaches the internal iterator this DBIter reads from. May only be called
// while no internal iterator is attached (asserted below).
virtual void SetIter(InternalIterator* iter) {
  assert(iter_ == nullptr);
  iter_ = iter;
  // A pin request may have been recorded before any internal iterator was
  // attached; forward it to the iterator that was just installed.
  // NOTE(review): the Status returned by PinData() is discarded here.
  const bool must_pin = iter_pinned_ && iter_ != nullptr;
  if (must_pin) {
    iter_->PinData();
  }
}
virtual bool Valid() const override { return valid_; }
virtual Slice key() const override {
Expand All @@ -110,6 +114,32 @@ class DBIter: public Iterator {
return status_;
}
}
// Asks the internal iterator (when one is attached) to pin its data and, on
// success, records the pinned state on this DBIter.
// Returns the internal iterator's status, or a default-constructed Status
// when no internal iterator is attached.
virtual Status PinData() {
  Status pin_status;
  if (iter_ != nullptr) {
    pin_status = iter_->PinData();
  }
  if (!pin_status.ok()) {
    // The internal iterator reported a failure: leave iter_pinned_ untouched.
    return pin_status;
  }
  // The flag is raised even when no internal iterator is attached yet, so
  // that SetIter() pins whichever iterator it receives later.
  iter_pinned_ = true;
  return pin_status;
}
// Asks the internal iterator (when one is attached) to release its pinned
// data and, on success, clears the pinned state on this DBIter.
// Returns the internal iterator's status, or a default-constructed Status
// when no internal iterator is attached.
virtual Status ReleasePinnedData() {
  Status release_status;
  if (iter_ != nullptr) {
    release_status = iter_->ReleasePinnedData();
  }
  if (!release_status.ok()) {
    // The release failed: keep reporting the iterator as pinned.
    return release_status;
  }
  iter_pinned_ = false;
  return release_status;
}
// Reports whether the current key is pinned. Must only be called while the
// iterator is valid (asserted below). The answer is true only when pinning
// is enabled on this iterator and saved_key_ itself reports a pinned key.
virtual bool IsKeyPinned() const override {
  assert(valid_);
  if (!iter_pinned_) {
    return false;
  }
  return saved_key_.IsKeyPinned();
}

virtual void Next() override;
virtual void Prev() override;
Expand Down Expand Up @@ -159,6 +189,7 @@ class DBIter: public Iterator {
const Slice* iterate_upper_bound_;
IterKey prefix_start_;
bool prefix_same_as_start_;
bool iter_pinned_;

// No copying allowed
DBIter(const DBIter&);
Expand Down Expand Up @@ -257,18 +288,21 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
case kTypeSingleDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
saved_key_.SetKey(ikey.user_key);
saved_key_.SetKey(ikey.user_key,
!iter_->IsKeyPinned() /* copy */);
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeValue:
valid_ = true;
saved_key_.SetKey(ikey.user_key);
saved_key_.SetKey(ikey.user_key,
!iter_->IsKeyPinned() /* copy */);
return;
case kTypeMerge:
// By now, we are sure the current ikey is going to yield a value
saved_key_.SetKey(ikey.user_key);
saved_key_.SetKey(ikey.user_key,
!iter_->IsKeyPinned() /* copy */);
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
Expand Down Expand Up @@ -428,7 +462,8 @@ void DBIter::PrevInternal() {
ParsedInternalKey ikey;

while (iter_->Valid()) {
saved_key_.SetKey(ExtractUserKey(iter_->key()));
saved_key_.SetKey(ExtractUserKey(iter_->key()),
!iter_->IsKeyPinned() /* copy */);
if (FindValueForCurrentKey()) {
valid_ = true;
if (!iter_->Valid()) {
Expand Down Expand Up @@ -744,7 +779,7 @@ void DBIter::SeekToLast() {
// it will seek to the last key before the
// ReadOptions.iterate_upper_bound
if (iter_->Valid() && iterate_upper_bound_ != nullptr) {
saved_key_.SetKey(*iterate_upper_bound_);
saved_key_.SetKey(*iterate_upper_bound_, false /* copy */);
std::string last_key;
AppendInternalKey(&last_key,
ParsedInternalKey(saved_key_.GetKey(), kMaxSequenceNumber,
Expand Down Expand Up @@ -781,10 +816,15 @@ Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound,
bool prefix_same_as_start) {
return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
false, max_sequential_skip_in_iterations,
iterate_upper_bound, prefix_same_as_start);
bool prefix_same_as_start, bool pin_data) {
DBIter* db_iter =
new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
false, max_sequential_skip_in_iterations, iterate_upper_bound,
prefix_same_as_start);
if (pin_data) {
db_iter->PinData();
}
return db_iter;
}

// Runs the wrapped DBIter's destructor explicitly rather than deleting it.
// NOTE(review): presumably needed because the DBIter lives in memory obtained
// from this wrapper's arena (see NewArenaWrappedDbIterator) — confirm.
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
Expand All @@ -806,6 +846,13 @@ inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
// Pinning API of the arena wrapper: each call below is forwarded unchanged to
// the wrapped DBIter and its result is returned as-is.

// Forwards the pin request to the wrapped DBIter.
inline Status ArenaWrappedDBIter::PinData() { return db_iter_->PinData(); }
// Forwards the release request to the wrapped DBIter.
inline Status ArenaWrappedDBIter::ReleasePinnedData() {
return db_iter_->ReleasePinnedData();
}
// Returns whatever the wrapped DBIter reports for its current key.
inline bool ArenaWrappedDBIter::IsKeyPinned() const {
return db_iter_->IsKeyPinned();
}
void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
void* arg2) {
db_iter_->RegisterCleanup(function, arg1, arg2);
Expand All @@ -815,7 +862,8 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const ImmutableCFOptions& ioptions,
const Comparator* user_key_comparator, const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound, bool prefix_same_as_start) {
const Slice* iterate_upper_bound, bool prefix_same_as_start,
bool pin_data) {
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
Arena* arena = iter->GetArena();
auto mem = arena->AllocateAligned(sizeof(DBIter));
Expand All @@ -825,6 +873,9 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator(
iterate_upper_bound, prefix_same_as_start);

iter->SetDBIter(db_iter);
if (pin_data) {
iter->PinData();
}

return iter;
}
Expand Down
8 changes: 6 additions & 2 deletions db/db_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ extern Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& options,
const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr,
bool prefix_same_as_start = false);
bool prefix_same_as_start = false,
bool pin_data = false);

// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
// iterator is supposed be allocated. This class is used as an entry point of
Expand Down Expand Up @@ -63,6 +64,9 @@ class ArenaWrappedDBIter : public Iterator {
virtual Status status() const override;

void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
// Pinning API; the definitions forward to the wrapped DBIter.
// Requests that the data backing the iterator be pinned.
virtual Status PinData();
// Requests that previously pinned data be released.
virtual Status ReleasePinnedData();
// If true, the Slice returned by key() is valid as long as the iterator is
// not deleted.
virtual bool IsKeyPinned() const override;

private:
DBIter* db_iter_;
Expand All @@ -75,6 +79,6 @@ extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
const Comparator* user_key_comparator, const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr,
bool prefix_same_as_start = false);
bool prefix_same_as_start = false, bool pin_data = false);

} // namespace rocksdb
Loading

0 comments on commit aececc2

Please sign in to comment.