Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index value delta encoding #3983

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
## Unreleased
### Public API Change
### New Features
* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used.
### Bug Fixes

## 5.15.0 (7/17/2018)
Expand All @@ -13,7 +14,7 @@
* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries.

### New Features
* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used.
* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used.
* Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1.
* Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`.
* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table.
Expand Down
1 change: 0 additions & 1 deletion db/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ class VersionEdit;
class TableBuilder;
class WritableFileWriter;
class InternalStats;
class InternalIterator;

// @param column_family_name Name of the column family that is also identified
// by column_family_id, or empty string if unknown. It must outlive the
Expand Down
2 changes: 1 addition & 1 deletion db/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1047,7 +1047,7 @@ InternalIterator* DBImpl::NewInternalIterator(
} else {
CleanupSuperVersion(super_version);
}
return NewErrorInternalIterator(s, arena);
return NewErrorInternalIterator<Slice>(s, arena);
}

ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
Expand Down
1 change: 0 additions & 1 deletion db/db_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ namespace rocksdb {

class Arena;
class DBIter;
class InternalIterator;

// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
Expand Down
39 changes: 20 additions & 19 deletions db/db_properties_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
ResetTableProperties(tp);
sscanf(tp_string.c_str(),
"# data blocks %" SCNu64 " # entries %" SCNu64
" # range deletions %" SCNu64
" raw key size %" SCNu64
" # range deletions %" SCNu64 " raw key size %" SCNu64
" raw average key size %lf "
" raw value size %" SCNu64
" raw average value size %lf "
" data block size %" SCNu64 " index block size (user-key? %" SCNu64
") %" SCNu64 " filter block size %" SCNu64,
", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
&tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions,
&tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double,
&tp->data_size, &tp->index_key_is_user_key, &tp->index_size,
&tp->filter_size);
&tp->data_size, &tp->index_key_is_user_key,
&tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size);
}

void VerifySimilar(uint64_t a, uint64_t b, double bias) {
Expand Down Expand Up @@ -224,14 +223,11 @@ void VerifyTableProperties(const TableProperties& base_tp,
ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
}

void GetExpectedTableProperties(TableProperties* expected_tp,
const int kKeySize, const int kValueSize,
const int kKeysPerTable,
const int kRangeDeletionsPerTable,
const int kTableCount,
const int kBloomBitsPerKey,
const size_t kBlockSize,
const bool index_key_is_user_key) {
void GetExpectedTableProperties(
TableProperties* expected_tp, const int kKeySize, const int kValueSize,
const int kKeysPerTable, const int kRangeDeletionsPerTable,
const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
const bool index_key_is_user_key, const bool value_delta_encoding) {
const int kKeyCount = kTableCount * kKeysPerTable;
const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
const int kAvgSuccessorSize = kKeySize / 5;
Expand All @@ -248,7 +244,9 @@ void GetExpectedTableProperties(TableProperties* expected_tp,
kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
expected_tp->index_size =
expected_tp->num_data_blocks *
(kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8));
(kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
// discount 1 byte as value size is not encoded in value delta encoding
(value_delta_encoding ? 1 : 0));
expected_tp->filter_size =
kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8);
}
Expand Down Expand Up @@ -342,12 +340,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
TableProperties output_tp;
ParseTablePropertiesString(property, &output_tp);
bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;

TableProperties expected_tp;
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kRangeDeletionsPerTable,
kTableCount, kBloomBitsPerKey,
table_options.block_size, index_key_is_user_key);
table_options.block_size, index_key_is_user_key,
value_is_delta_encoded);

VerifyTableProperties(expected_tp, output_tp);
}
Expand Down Expand Up @@ -533,6 +533,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
ParseTablePropertiesString(tp_string, &tp);
bool index_key_is_user_key = tp.index_key_is_user_key > 0;
bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
ASSERT_EQ(sum_tp.data_size, tp.data_size);
ASSERT_EQ(sum_tp.index_size, tp.index_size);
ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
Expand All @@ -542,10 +543,10 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
if (table > 3) {
GetExpectedTableProperties(
&expected_tp, kKeySize, kValueSize, kKeysPerTable,
kRangeDeletionsPerTable, table, kBloomBitsPerKey,
table_options.block_size, index_key_is_user_key);
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kRangeDeletionsPerTable, table,
kBloomBitsPerKey, table_options.block_size,
index_key_is_user_key, value_is_delta_encoded);
// Gives larger bias here as index block size, filter block size,
// and data block size become much harder to estimate in this test.
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);
Expand Down
4 changes: 2 additions & 2 deletions db/db_test_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,8 @@ Options DBTestBase::GetOptions(
options.prefix_extractor.reset(NewNoopTransform());
break;
}
case kBlockBasedTableWithPartitionedIndexFormat3: {
table_options.format_version = 3;
case kBlockBasedTableWithPartitionedIndexFormat4: {
table_options.format_version = 4;
// Format 3 changes the binary index format. Since partitioned index is a
// super-set of simple indexes, we are also using kTwoLevelIndexSearch to
// test this format.
Expand Down
2 changes: 1 addition & 1 deletion db/db_test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ class DBTestBase : public testing::Test {
kLevelSubcompactions,
kBlockBasedTableWithIndexRestartInterval,
kBlockBasedTableWithPartitionedIndex,
kBlockBasedTableWithPartitionedIndexFormat3,
kBlockBasedTableWithPartitionedIndexFormat4,
kPartitionedFilterWithNewTableReaderForCompactions,
kUniversalSubcompactions,
// This must be the last line
Expand Down
1 change: 0 additions & 1 deletion db/memtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ namespace rocksdb {
class Mutex;
class MemTableIterator;
class MergeContext;
class InternalIterator;

struct ImmutableMemTableOptions {
explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
Expand Down
1 change: 0 additions & 1 deletion db/merge_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class Iterator;
class Logger;
class MergeOperator;
class Statistics;
class InternalIterator;

class MergeHelper {
public:
Expand Down
4 changes: 2 additions & 2 deletions db/table_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ InternalIterator* TableCache::NewIterator(
if (s.ok()) {
if (options.table_filter &&
!options.table_filter(*table_reader->GetTableProperties())) {
result = NewEmptyInternalIterator(arena);
result = NewEmptyInternalIterator<Slice>(arena);
} else {
result = table_reader->NewIterator(options, prefix_extractor, arena,
skip_filters, for_compaction);
Expand Down Expand Up @@ -279,7 +279,7 @@ InternalIterator* TableCache::NewIterator(
}
if (!s.ok()) {
assert(result == nullptr);
result = NewErrorInternalIterator(s, arena);
result = NewErrorInternalIterator<Slice>(s, arena);
}
return result;
}
Expand Down
1 change: 0 additions & 1 deletion db/table_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class Arena;
struct FileDescriptor;
class GetContext;
class HistogramImpl;
class InternalIterator;

class TableCache {
public:
Expand Down
4 changes: 2 additions & 2 deletions db/version_edit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ enum CustomTag : uint32_t {
kTerminate = 1, // The end of customized fields
kNeedCompaction = 2,
// Since Manifest is not entirely currently forward-compatible, and the only
// forward-compatbile part is the CutsomtTag of kNewFile, we currently encode
// forward-compatible part is the CutsomtTag of kNewFile, we currently encode
// kMinLogNumberToKeep as part of a CustomTag as a hack. This should be
// removed when manifest becomes forward-comptabile.
kMinLogNumberToKeepHack = 3,
Expand Down Expand Up @@ -274,7 +274,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
break;
case kMinLogNumberToKeepHack:
// This is a hack to encode kMinLogNumberToKeep in a
// forward-compatbile fashion.
// forward-compatible fashion.
if (!GetFixed64(&field, &min_log_number_to_keep_)) {
return "deleted log number malformatted";
}
Expand Down
1 change: 0 additions & 1 deletion db/version_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ class Writer;
}

class Compaction;
class InternalIterator;
class LogBuffer;
class LookupKey;
class MemTable;
Expand Down
3 changes: 3 additions & 0 deletions include/rocksdb/table_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct TablePropertiesNames {
static const std::string kIndexPartitions;
static const std::string kTopLevelIndexSize;
static const std::string kIndexKeyIsUserKey;
static const std::string kIndexValueIsDeltaEncoded;
static const std::string kFilterSize;
static const std::string kRawKeySize;
static const std::string kRawValueSize;
Expand Down Expand Up @@ -139,6 +140,8 @@ struct TableProperties {
// Whether the index key is user key. Otherwise it includes 8 byte of sequence
// number added by internal key format.
uint64_t index_key_is_user_key = 0;
// Whether delta encoding is used to encode the index values.
uint64_t index_value_is_delta_encoded = 0;
// the size of filter block.
uint64_t filter_size = 0;
// total raw key size
Expand Down
Loading