Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into pr/6648
Browse files Browse the repository at this point in the history
  • Loading branch information
pdillinger committed Apr 10, 2020
2 parents d31e395 + f08630b commit 43eee91
Show file tree
Hide file tree
Showing 40 changed files with 1,444 additions and 80 deletions.
5 changes: 5 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ matrix:
- os: linux
compiler: clang
# Exclude most osx, arm64 and ppc64le tests for pull requests, but build in branches
# Temporarily disable ppc64le unit tests in PRs until Travis gets its act together (#6653)
- if: type = pull_request
os: linux
arch: ppc64le
env: TEST_GROUP=platform_dependent
# NB: the cmake build is a partial java test
- if: type = pull_request
os: osx
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,7 @@ set(SOURCES
memory/arena.cc
memory/concurrent_arena.cc
memory/jemalloc_nodump_allocator.cc
memory/memkind_kmem_allocator.cc
memtable/alloc_tracker.cc
memtable/hash_linklist_rep.cc
memtable/hash_skiplist_rep.cc
Expand Down Expand Up @@ -1079,6 +1080,7 @@ if(WITH_TESTS)
logging/env_logger_test.cc
logging/event_logger_test.cc
memory/arena_test.cc
memory/memkind_kmem_allocator_test.cc
memtable/inlineskiplist_test.cc
memtable/skiplist_test.cc
memtable/write_buffer_manager_test.cc
Expand Down Expand Up @@ -1120,6 +1122,7 @@ if(WITH_TESTS)
util/slice_test.cc
util/slice_transform_test.cc
util/timer_queue_test.cc
util/timer_test.cc
util/thread_list_test.cc
util/thread_local_test.cc
util/work_queue_test.cc
Expand All @@ -1142,6 +1145,7 @@ if(WITH_TESTS)
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
utilities/transactions/optimistic_transaction_test.cc
utilities/transactions/transaction_test.cc
utilities/transactions/transaction_lock_mgr_test.cc
utilities/transactions/write_prepared_transaction_test.cc
utilities/transactions/write_unprepared_transaction_test.cc
utilities/ttl/ttl_test.cc
Expand Down
8 changes: 8 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
# Rocksdb Change Log
## Unreleased
### Bug Fixes
* Fix wrong result being read from ingested file. May happen when a key in the file happens to be a prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced.
* Fix a bug when making options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts dynamically changeable: the modified values are not written to option files or returned back to users when being queried.

### New Features
* Added support for pipelined & parallel compression optimization for `BlockBasedTableBuilder`. This optimization makes block building, block compression and block appending a pipeline, and uses multiple threads to accelerate block compression. Users can set `CompressionOptions::parallel_threads` greater than 1 to enable compression parallelism.

## 6.9.0 (03/29/2020)
### Behavior changes
* Since RocksDB 6.8, ttl-based FIFO compaction can drop a file whose oldest key becomes older than options.ttl while others have not. This fix reverts this and makes ttl-based FIFO compaction use the file's flush time as the criterion. This fix also requires that max_open_files = -1 and compaction_options_fifo.allow_compaction = false to function properly.
Expand All @@ -21,6 +27,8 @@

### Performance Improvements
* In CompactRange, for levels starting from 0, if the level does not have any file with any key falling in the specified range, the level is skipped. So instead of always compacting from level 0, the compaction starts from the first level with keys in the specified range until the last such level.
* Reduced memory copy when reading sst footer and blobdb in direct IO mode.
* When restarting a database with large numbers of sst files, a large amount of CPU time is spent on getting the logical block size of the sst files, which slows down database startup. This inefficiency is optimized away with an internal cache for the logical block sizes.

### New Features
* Basic support for user timestamp in iterator. Seek/SeekToFirst/Next and lower/upper bounds are supported. Reverse iteration is not supported. Merge is not considered.
Expand Down
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ TESTS = \
column_family_test \
table_properties_collector_test \
arena_test \
memkind_kmem_allocator_test \
block_test \
data_block_hash_index_test \
cache_test \
Expand Down Expand Up @@ -584,6 +585,7 @@ TESTS = \
compaction_job_stats_test \
option_change_migration_test \
transaction_test \
transaction_lock_mgr_test \
ldb_cmd_test \
persistent_cache_test \
statistics_test \
Expand All @@ -606,6 +608,7 @@ TESTS = \
defer_test \
blob_file_addition_test \
blob_file_garbage_test \
timer_test \

ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1)
TESTS += folly_synchronization_distributed_mutex_test
Expand All @@ -630,6 +633,7 @@ PARALLEL_TEST = \
persistent_cache_test \
table_test \
transaction_test \
transaction_lock_mgr_test \
write_prepared_transaction_test \
write_unprepared_transaction_test \

Expand Down Expand Up @@ -1238,6 +1242,9 @@ db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)

arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

memkind_kmem_allocator_test: memory/memkind_kmem_allocator_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
Expand Down Expand Up @@ -1673,6 +1680,9 @@ write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS)
heap_test: util/heap_test.o $(GTEST)
$(AM_LINK)

transaction_lock_mgr_test: utilities/transactions/transaction_lock_mgr_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

Expand Down Expand Up @@ -1748,6 +1758,9 @@ blob_file_addition_test: db/blob/blob_file_addition_test.o $(LIBOBJECTS) $(TESTH
blob_file_garbage_test: db/blob/blob_file_garbage_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

timer_test: util/timer_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

#-------------------------------------------------
# make install related stuff
INSTALL_PATH ?= /usr/local
Expand Down
14 changes: 14 additions & 0 deletions TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -1456,13 +1456,27 @@ ROCKS_TESTS = [
[],
[],
],
[
"timer_test",
"util/timer_test.cc",
"serial",
[],
[],
],
[
"trace_analyzer_test",
"tools/trace_analyzer_test.cc",
"serial",
[],
[],
],
[
"transaction_lock_mgr_test",
"utilities/transactions/transaction_lock_mgr_test.cc",
"parallel",
[],
[],
],
[
"transaction_test",
"utilities/transactions/transaction_test.cc",
Expand Down
19 changes: 18 additions & 1 deletion build_tools/build_detect_platform
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# -DZSTD if the ZSTD library is present
# -DNUMA if the NUMA library is present
# -DTBB if the TBB library is present
# -DMEMKIND if the memkind library is present
#
# Using gflags in rocksdb:
# Our project depends on gflags, which requires users to take some extra steps
Expand Down Expand Up @@ -425,7 +426,23 @@ EOF
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE"
fi
fi


if ! test $ROCKSDB_DISABLE_MEMKIND; then
# Test whether memkind library is installed
$CXX $CFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <memkind.h>
int main() {
memkind_malloc(MEMKIND_DAX_KMEM, 1024);
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DMEMKIND"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lmemkind"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lmemkind"
fi
fi

if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then
# Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
Expand Down
11 changes: 6 additions & 5 deletions build_tools/setup_centos7.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
set -e
set -ex

ROCKSDB_VERSION="5.10.3"
ZSTD_VERSION="1.1.3"
ROCKSDB_VERSION="6.7.3"
ZSTD_VERSION="1.4.4"

echo "This script configures CentOS with everything needed to build and run RocksDB"

Expand Down Expand Up @@ -40,5 +40,6 @@ cd /usr/local/rocksdb
chown -R vagrant:vagrant /usr/local/rocksdb/
sudo -u vagrant make static_lib
cd examples/
sudo -u vagrant make all
sudo -u vagrant ./c_simple_example
sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ make all
sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ ./c_simple_example

10 changes: 10 additions & 0 deletions db/c.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4441,6 +4441,16 @@ uint64_t rocksdb_approximate_memory_usage_get_cache_total(
return memory_usage->cache_total;
}

void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.dump_malloc_stats = val;
}

void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.memtable_whole_key_filtering = val;
}

// deletes container with memory usage estimates
void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
delete usage;
Expand Down
10 changes: 5 additions & 5 deletions db/db_impl/db_impl_compaction_flush.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2549,8 +2549,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
"[%s] Manual compaction from level-%d from %s .. "
"%s; nothing to do\n",
m->cfd->GetName().c_str(), m->input_level,
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
(m->end ? m->end->DebugString().c_str() : "(end)"));
(m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
(m->end ? m->end->DebugString(true).c_str() : "(end)"));
} else {
// First check if we have enough room to do the compaction
bool enough_room = EnoughRoomForCompaction(
Expand All @@ -2569,11 +2569,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
"[%s] Manual compaction from level-%d to level-%d from %s .. "
"%s; will stop at %s\n",
m->cfd->GetName().c_str(), m->input_level, c->output_level(),
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
(m->end ? m->end->DebugString().c_str() : "(end)"),
(m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
(m->end ? m->end->DebugString(true).c_str() : "(end)"),
((m->done || m->manual_end == nullptr)
? "(end)"
: m->manual_end->DebugString().c_str()));
: m->manual_end->DebugString(true).c_str()));
}
}
} else if (!is_prepicked && !compaction_queue_.empty()) {
Expand Down
48 changes: 27 additions & 21 deletions db/db_iter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
cfd_(cfd),
start_seqnum_(read_options.iter_start_seqnum),
timestamp_ub_(read_options.timestamp),
timestamp_lb_(read_options.iter_start_ts),
timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
RecordTick(statistics_, NO_ITERATOR_CREATED);
if (pin_thru_lifetime_) {
Expand Down Expand Up @@ -246,23 +247,22 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,

assert(ikey_.user_key.size() >= timestamp_size_);
Slice ts;
bool more_recent = false;
if (timestamp_size_ > 0) {
ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
}
if (IsVisible(ikey_.sequence, ts)) {
if (IsVisible(ikey_.sequence, ts, &more_recent)) {
// If the previous entry is of seqnum 0, the current entry will not
// possibly be skipped. This condition can potentially be relaxed to
// prev_key.seq <= ikey_.sequence. We are cautious because it will be more
// prone to bugs causing the same user key with the same sequence number.
if (!is_prev_key_seqnum_zero && skipping_saved_key &&
user_comparator_.CompareWithoutTimestamp(
ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
num_skipped++; // skip this entry
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else {
assert(!skipping_saved_key ||
user_comparator_.CompareWithoutTimestamp(
ikey_.user_key, saved_key_.GetUserKey()) > 0);
CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
num_skipped = 0;
reseek_done = false;
switch (ikey_.type) {
Expand Down Expand Up @@ -363,11 +363,13 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
}
}
} else {
PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
if (more_recent) {
PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
}

// This key was inserted after our snapshot was taken.
// If this happens too many times in a row for the same user key, we want
// to seek to the target sequence number.
// This key was inserted after our snapshot was taken or skipped by
// timestamp range. If this happens too many times in a row for the same
// user key, we want to seek to the target sequence number.
int cmp = user_comparator_.CompareWithoutTimestamp(
ikey_.user_key, saved_key_.GetUserKey());
if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
Expand Down Expand Up @@ -1101,20 +1103,24 @@ bool DBIter::TooManyInternalKeysSkipped(bool increment) {
return false;
}

bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts) {
bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
bool* more_recent) {
// Remember that comparator orders preceding timestamp as larger.
int cmp_ts = timestamp_ub_ != nullptr
? user_comparator_.CompareTimestamp(ts, *timestamp_ub_)
: 0;
if (cmp_ts > 0) {
return false;
}
if (read_callback_ == nullptr) {
return sequence <= sequence_;
} else {
// TODO(yanqin): support timestamp in read_callback_.
return read_callback_->IsVisible(sequence);
// TODO(yanqin): support timestamp in read_callback_.
bool visible_by_seq = (read_callback_ == nullptr)
? sequence <= sequence_
: read_callback_->IsVisible(sequence);

bool visible_by_ts =
(timestamp_ub_ == nullptr ||
user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
(timestamp_lb_ == nullptr ||
user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);

if (more_recent) {
*more_recent = !visible_by_seq;
}
return visible_by_seq && visible_by_ts;
}

void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
Expand Down
13 changes: 12 additions & 1 deletion db/db_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ class DBIter final : public Iterator {
// entry can be found within the prefix.
void PrevInternal(const Slice* prefix);
bool TooManyInternalKeysSkipped(bool increment = true);
bool IsVisible(SequenceNumber sequence, const Slice& ts);
bool IsVisible(SequenceNumber sequence, const Slice& ts,
bool* more_recent = nullptr);

// Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
// is called
Expand Down Expand Up @@ -270,6 +271,15 @@ class DBIter final : public Iterator {
return expect_total_order_inner_iter_;
}

  // If a lower bound on the timestamp is given by ReadOptions.iter_start_ts, we
  // need to return multiple versions of the same user key. In that case we
  // cannot skip an entry merely because its user key matches, when its
  // timestamp differs but still falls within the requested timestamp range.
inline int CompareKeyForSkip(const Slice& a, const Slice& b) {
return timestamp_lb_ != nullptr
? user_comparator_.Compare(a, b)
: user_comparator_.CompareWithoutTimestamp(a, b);
}

const SliceTransform* prefix_extractor_;
Env* const env_;
Logger* logger_;
Expand Down Expand Up @@ -338,6 +348,7 @@ class DBIter final : public Iterator {
// if this value > 0 iterator will return internal keys
SequenceNumber start_seqnum_;
const Slice* const timestamp_ub_;
const Slice* const timestamp_lb_;
const size_t timestamp_size_;
};

Expand Down
Loading

0 comments on commit 43eee91

Please sign in to comment.