Skip to content

Commit

Permalink
Clarify caching behavior for index and filter partitions (#9068)
Browse files Browse the repository at this point in the history
Summary:
Somewhat confusingly, index and filter partition blocks are
never owned by table readers, even with
cache_index_and_filter_blocks=false. They still go into block cache
(possibly pinned by table reader) if there is a block cache. If no block
cache, they are only loaded transiently on demand.

This PR primarily clarifies the options APIs and some internal code
comments.

Also, this closes a hypothetical data corruption vulnerability where
some but not all index partitions are pinned. I haven't been able to
reproduce a case where it can happen (the failure seems to propagate
to abort table open) but it's worth patching nonetheless.

Fixes facebook/rocksdb#8979

Pull Request resolved: facebook/rocksdb#9068

Test Plan:
existing tests :-/  I could cover the new code using sync
points, but then I'd have to very carefully relax my `assert(false)`

Reviewed By: ajkr

Differential Revision: D31898284

Pulled By: pdillinger

fbshipit-source-id: f2511a7d3a36bc04b627935d8e6cfea6422f98be
  • Loading branch information
pdillinger authored and facebook-github-bot committed Oct 28, 2021
1 parent 82846f4 commit 5bf9a7d
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 36 deletions.
12 changes: 8 additions & 4 deletions include/rocksdb/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ struct BlockBasedTableOptions {
// caching as they should now apply to range tombstone and compression
// dictionary meta-blocks, in addition to index and filter meta-blocks.
//
// Indicating if we'd put index/filter blocks to the block cache.
// If not specified, each "table reader" object will pre-load index/filter
// block during table initialization.
// Whether to put index/filter blocks in the block cache. When false,
// each "table reader" object will pre-load index/filter blocks during
// table initialization. Index and filter partition blocks always use
// block cache regardless of this option.
bool cache_index_and_filter_blocks = false;

// If cache_index_and_filter_blocks is enabled, cache index and filter
Expand Down Expand Up @@ -190,6 +191,8 @@ struct BlockBasedTableOptions {
kHashSearch = 0x01,

// A two-level index implementation. Both levels are binary search indexes.
// Second level index blocks ("partitions") use block cache even when
// cache_index_and_filter_blocks=false.
kTwoLevelIndexSearch = 0x02,

// Like kBinarySearch, but index also contains first key of each block.
Expand Down Expand Up @@ -285,7 +288,8 @@ struct BlockBasedTableOptions {
// well.
// TODO(myabandeh): remove the note above once the limitation is lifted
// Use partitioned full filters for each SST file. This option is
// incompatible with block-based filters.
// incompatible with block-based filters. Filter partition blocks use
// block cache even when cache_index_and_filter_blocks=false.
bool partition_filters = false;

// Option to generate Bloom/Ribbon filters that minimize memory
Expand Down
35 changes: 17 additions & 18 deletions table/block_based/block_based_table_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2063,24 +2063,23 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
const BlockHandle& handle) {
// Return a block iterator on the index partition
auto block = block_map_->find(handle.offset());
// This is a possible scenario since block cache might not have had space
// for the partition
if (block != block_map_->end()) {
const Rep* rep = table_->get_rep();
assert(rep);

Statistics* kNullStats = nullptr;
// We don't return pinned data from index blocks, so no need
// to set `block_contents_pinned`.
return block->second.GetValue()->NewIndexIterator(
rep->internal_comparator.user_comparator(),
rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
rep->index_has_first_key, rep->index_key_includes_seq,
rep->index_value_is_full);
}
// Create an empty iterator
// TODO(ajkr): this is not the right way to handle an unpinned partition.
return new IndexBlockIter();
// block_map_ must be exhaustive
if (block == block_map_->end()) {
assert(false);
// Signal problem to caller
return nullptr;
}
const Rep* rep = table_->get_rep();
assert(rep);

Statistics* kNullStats = nullptr;
// We don't return pinned data from index blocks, so no need
// to set `block_contents_pinned`.
return block->second.GetValue()->NewIndexIterator(
rep->internal_comparator.user_comparator(),
rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
rep->index_has_first_key, rep->index_key_includes_seq,
rep->index_value_is_full);
}

// This will be broken if the user specifies an unusual implementation
Expand Down
2 changes: 2 additions & 0 deletions table/block_based/partitioned_filter_block.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
bool index_value_is_full() const;

protected:
// For partition blocks pinned in cache. Can be a subset of blocks
// in case some fail insertion on attempt to pin.
std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>>
filter_map_;
};
Expand Down
45 changes: 31 additions & 14 deletions table/block_based/partitioned_index_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,12 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
Statistics* kNullStats = nullptr;

CachableEntry<Block> index_block;
Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
&lookup_context, &index_block);
if (!s.ok()) {
return s;
{
Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
&lookup_context, &index_block);
if (!s.ok()) {
return s;
}
}

// We don't return pinned data from index blocks, so no need
Expand Down Expand Up @@ -149,23 +151,30 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer,
false /*Implicit auto readahead*/);
IOOptions opts;
s = rep->file->PrepareIOOptions(ro, opts);
if (s.ok()) {
s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
static_cast<size_t>(prefetch_len));
}
if (!s.ok()) {
return s;
{
Status s = rep->file->PrepareIOOptions(ro, opts);
if (s.ok()) {
s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
static_cast<size_t>(prefetch_len));
}
if (!s.ok()) {
return s;
}
}

// For saving "all or nothing" to partition_map_
std::unordered_map<uint64_t, CachableEntry<Block>> map_in_progress;

// After prefetch, read the partitions one by one
biter.SeekToFirst();
size_t partition_count = 0;
for (; biter.Valid(); biter.Next()) {
handle = biter.value().handle;
CachableEntry<Block> block;
++partition_count;
// TODO: Support counter batch update for partitioned index and
// filter blocks
s = table()->MaybeReadBlockAndLoadToCache(
Status s = table()->MaybeReadBlockAndLoadToCache(
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
/*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr,
&lookup_context, /*contents=*/nullptr);
Expand All @@ -174,14 +183,22 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
return s;
}
if (block.GetValue() != nullptr) {
// Might need to "pin" some mmap-read blocks (GetOwnValue) if some
// partitions are successfully compressed (cached) and some are not
// compressed (mmap eligible)
if (block.IsCached() || block.GetOwnValue()) {
if (pin) {
partition_map_[handle.offset()] = std::move(block);
map_in_progress[handle.offset()] = std::move(block);
}
}
}
}
return biter.status();
Status s = biter.status();
// Save (pin) them only if everything checks out
if (map_in_progress.size() == partition_count && s.ok()) {
std::swap(partition_map_, map_in_progress);
}
return s;
}

} // namespace ROCKSDB_NAMESPACE
3 changes: 3 additions & 0 deletions table/block_based/partitioned_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
CachableEntry<Block>&& index_block)
: IndexReaderCommon(t, std::move(index_block)) {}

// For partition blocks pinned in cache. This is expected to be "all or
// none" so that !partition_map_.empty() can use an iterator expecting
// all partitions to be saved here.
std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};
} // namespace ROCKSDB_NAMESPACE
4 changes: 4 additions & 0 deletions table/two_level_iterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ void TwoLevelIndexIterator::InitDataBlock() {
state_->NewSecondaryIterator(handle);
data_block_handle_ = handle;
SetSecondLevelIterator(iter);
if (iter == nullptr) {
status_ = Status::Corruption("Missing block for partition " +
handle.ToString());
}
}
}
}
Expand Down

0 comments on commit 5bf9a7d

Please sign in to comment.