From 8f763bdeab5f22f94935bf2d080a544a61e1268e Mon Sep 17 00:00:00 2001
From: Hui Xiao
Date: Mon, 8 May 2023 13:14:28 -0700
Subject: [PATCH] Record and use the tail size to prefetch table tail (#11406)

Summary:
**Context:**
We prefetch the tail part of an SST file (i.e., the blocks after the data blocks, through the end of the file) during each SST file open, in the hope of prefetching everything needed for later reads at once, e.g., the footer, metaindex, and filter/index blocks. The existing approach estimates the tail size to prefetch through the `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156. That approach can cause undersized reads in unlucky cases (e.g., a small read into the tail buffer during a table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetch use a size smaller than it should) and is hard to debug. Therefore we decided to record the exact tail size and use it directly to prefetch the tail of the SST, instead of relying on heuristics.

**Summary:**
- Obtain the tail size in `BlockBasedTableBuilder::Finish()` and record it in the manifest
- For backward compatibility, fall back to `TailPrefetchStats`, and last to a simple heuristic that the tail size is a linear portion of the file size - see the PR conversation for more
- Make `tail_start_offset` part of the table properties and deduce the tail size to record in the manifest for external files (e.g., file ingestion, import CF) and DB repair (which have no access to the manifest)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406

Test Plan:
1. New UT
2. db bench

Note: db bench on /tmp/, where direct read is supported, is too slow to finish, and the default pinning setting in db bench is not helpful for profiling the number of SST reads per Get. Therefore I hacked the following to obtain the comparison below.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
                            &tail_prefetch_size);
 
   // Try file system prefetch
-  if (!file->use_direct_io() && !force_direct_prefetch) {
+  if (false && !file->use_direct_io() && !force_direct_prefetch) {
     if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
              .IsNotSupported()) {
       prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
           std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
     } else {
       BlockBasedTableOptions block_based_options;
+      block_based_options.metadata_cache_options.partition_pinning =
+          PinningTier::kAll;
       block_based_options.checksum =
           static_cast<ChecksumType>(FLAGS_checksum_type);
       if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (use `TailPrefetchStats` for the tail size + use a separate prefetch buffer in `PartitionedFilter/IndexReader::CacheDependencies()`)
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (record the tail size + use the same tail buffer in `PartitionedFilter/IndexReader::CacheDependencies()`)
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, this PR increases the prefetch tail hit count and decreases the SST read count.

3. Test backward compatibility by stepping through reads with post-PR code on a DB generated pre-PR.
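For intuition about the first summary bullet, here is a minimal sketch of how a builder can capture `tail_start_offset` and derive the tail size in `Finish()`. `ToyTableBuilder` and its members are invented for illustration only; in the PR, the real logic lives in `BlockBasedTableBuilder::Finish()` (see the diff below).
```
#include <cstdint>

// Invented stand-in for illustration; not the RocksDB API.
struct ToyTableBuilder {
  uint64_t offset = 0;             // bytes written to the file so far
  uint64_t tail_start_offset = 0;  // recorded as a table property
  uint64_t tail_size = 0;          // recorded in the manifest

  void AddDataBlock(uint64_t block_bytes) { offset += block_bytes; }

  void Finish(uint64_t meta_and_footer_bytes) {
    // The tail begins where the last data block ends...
    tail_start_offset = offset;
    // ...then filter/index blocks, properties, metaindex and footer follow...
    offset += meta_and_footer_bytes;
    // ...so the tail size is everything written after tail_start_offset.
    tail_size = offset - tail_start_offset;
  }
};

// Example: 8 KB of data blocks plus a 1 KB tail yields tail_size == 1024.
// ToyTableBuilder b; b.AddDataBlock(8192); b.Finish(1024);
```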
Reviewed By: pdillinger

Differential Revision: D45413346

Pulled By: hx235

fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
---
 HISTORY.md                                    |   3 +
 db/builder.cc                                 |   1 +
 db/compaction/compaction_job_test.cc          |   2 +-
 db/compaction/compaction_outputs.cc           |   1 +
 db/compaction/compaction_picker_test.cc       |   2 +-
 db/db_impl/db_impl_compaction_flush.cc        |   4 +-
 db/db_impl/db_impl_experimental.cc            |   2 +-
 db/db_impl/db_impl_open.cc                    |  19 +-
 db/db_test2.cc                                |  82 ------
 db/experimental.cc                            |   2 +-
 db/external_sst_file_ingestion_job.cc         |  12 +-
 db/flush_job.cc                               |   3 +-
 db/import_column_family_job.cc                |  12 +-
 db/repair.cc                                  |  13 +-
 db/table_cache.cc                             |  14 +-
 db/version_builder_test.cc                    | 260 ++++++++++--------
 db/version_edit.cc                            |  14 +
 db/version_edit.h                             |  16 +-
 db/version_edit_test.cc                       |  18 +-
 db/version_set.cc                             |   2 +-
 db/version_set_test.cc                        |  10 +-
 file/file_prefetch_buffer.h                   |   2 +
 file/prefetch_test.cc                         |  87 ++++++
 include/rocksdb/table_properties.h            |   5 +
 .../block_based/block_based_table_builder.cc  |  10 +
 table/block_based/block_based_table_builder.h |   4 +
 .../block_based/block_based_table_factory.cc  |   2 +-
 table/block_based/block_based_table_factory.h |   3 +
 table/block_based/block_based_table_reader.cc |  61 ++--
 table/block_based/block_based_table_reader.h  |  10 +-
 table/block_based/filter_block.h              |   4 +-
 table/block_based/partitioned_filter_block.cc |  36 +--
 table/block_based/partitioned_filter_block.h  |   3 +-
 table/block_based/partitioned_index_reader.cc |  36 +--
 table/block_based/partitioned_index_reader.h  |   3 +-
 table/block_fetcher_test.cc                   |   7 +-
 table/meta_blocks.cc                          |   3 +
 table/table_builder.h                         |  10 +-
 table/table_properties.cc                     |   2 +
 39 files changed, 466 insertions(+), 314 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 9b5e6027992..96170b41ba9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -10,6 +10,9 @@
 ### Bug Fixes
 * Delete an empty WAL file on DB open if the log number is less than the min log number to keep
 
+### Performance Improvements
+* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).
+
 ## 8.2.0 (04/24/2023)
 ### Public API Changes
 * `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined.
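The ingestion, import, and repair changes below all deduce the tail size the same way from table properties; here is that shared computation condensed into a hypothetical free function (the names mirror the fields used in the hunks, but this is a sketch, not the RocksDB API):
```
#include <cassert>
#include <cstdint>

// Mirrors the deduction added in external_sst_file_ingestion_job.cc,
// import_column_family_job.cc, and repair.cc below.
uint64_t DeduceTailSize(uint64_t file_size, uint64_t tail_start_offset,
                        uint64_t num_entries, uint64_t num_range_deletions) {
  // A file whose entries are all range deletions has no data blocks, so the
  // whole file is "tail" even though tail_start_offset is 0.
  bool contain_no_data_blocks =
      num_entries > 0 && (num_entries == num_range_deletions);
  if (tail_start_offset > 0 || contain_no_data_blocks) {
    assert(tail_start_offset <= file_size);
    return file_size - tail_start_offset;
  }
  return 0;  // unknown, e.g., the file predates the tail_start_offset property
}
```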
diff --git a/db/builder.cc b/db/builder.cc index eadc315c9aa..a99bb57e8c2 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -291,6 +291,7 @@ Status BuildTable( if (s.ok() && !empty) { uint64_t file_size = builder->FileSize(); meta->fd.file_size = file_size; + meta->tail_size = builder->GetTailSize(); meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 3f7966404f7..a7cf65f0131 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -386,7 +386,7 @@ class CompactionJobTestBase : public testing::Test { kUnknownFileCreationTime, versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(), kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + 0, 0); mutex_.Lock(); EXPECT_OK(versions_->LogAndApply( diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index cf5105e4108..f6e0dbd7d43 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -43,6 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status, const uint64_t current_bytes = builder_->FileSize(); if (s.ok()) { meta->fd.file_size = current_bytes; + meta->tail_size = builder_->GetTailSize(); meta->marked_for_compaction = builder_->NeedCompact(); } current_output().finished = true; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index c9cbb09ed37..a5a14ac21a7 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -148,7 +148,7 @@ class CompactionPickerTestBase : public testing::Test { smallest_seq, largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); f->compensated_file_size = (compensated_file_size != 0) ? 
compensated_file_size : file_size; f->oldest_ancester_time = oldest_ancestor_time; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index b8ab5eb7e7a..39a8339199e 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1758,7 +1758,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -3457,7 +3457,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); ROCKS_LOG_BUFFER( log_buffer, diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 5e3b7ba6155..8d958ffc17d 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -138,7 +138,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index a9b42d62ab4..5bad1e12bdf 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -616,7 +616,8 @@ Status DBImpl::Recover( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, - f->unique_id, f->compensated_range_deletion_size); + f->unique_id, f->compensated_range_deletion_size, + f->tail_size); ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Moving #%" PRIu64 " from from_level-%d to from_level-%d %" PRIu64 @@ -1673,14 +1674,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, constexpr int level = 0; if (s.ok() && has_output) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction, meta.temperature, - meta.oldest_blob_file_number, meta.oldest_ancester_time, - meta.file_creation_time, meta.epoch_number, - meta.file_checksum, meta.file_checksum_func_name, - meta.unique_id, meta.compensated_range_deletion_size); + edit->AddFile( + level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, meta.unique_id, + meta.compensated_range_deletion_size, meta.tail_size); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); diff --git a/db/db_test2.cc b/db/db_test2.cc index 544d9b299d6..43aea21813e 
100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5298,88 +5298,6 @@ TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBTest2, TestBBTTailPrefetch) { - std::atomic called(false); - size_t expected_lower_bound = 512 * 1024; - size_t expected_higher_bound = 512 * 1024; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - EXPECT_LE(expected_lower_bound, *prefetch_size); - EXPECT_GE(expected_higher_bound, *prefetch_size); - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - expected_lower_bound = 0; - expected_higher_bound = 8 * 1024; - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - // Full compaction to make sure there is no L0 file after the open. - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - called = false; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - - std::atomic first_call(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - if (first_call) { - EXPECT_EQ(4 * 1024, *prefetch_size); - first_call = false; - } else { - EXPECT_GE(4 * 1024, *prefetch_size); - } - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Options options = CurrentOptions(); - options.max_file_opening_threads = 1; // one thread - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.max_open_files = -1; - Reopen(options); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_TRUE(called.load()); - called = false; - - // Parallel loading SST files - options.max_file_opening_threads = 16; - Reopen(options); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -} - TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked diff --git a/db/experimental.cc b/db/experimental.cc index c2dce7fde46..cb5fb317945 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -102,7 +102,7 @@ Status UpdateManifestForFilesState( lf->oldest_blob_file_number, lf->oldest_ancester_time, lf->file_creation_time, lf->epoch_number, lf->file_checksum, lf->file_checksum_func_name, lf->unique_id, - lf->compensated_range_deletion_size); + lf->compensated_range_deletion_size, lf->tail_size); } } } else { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index ca9b6fb9bbe..f8716e5f487 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -464,6 +464,16 @@ Status ExternalSstFileIngestionJob::Run() { current_time = 
oldest_ancester_time = static_cast(temp_current_time); } + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + FileMetaData f_metadata( f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, @@ -472,7 +482,7 @@ Status ExternalSstFileIngestionJob::Run() { ingestion_options_.ingest_behind ? kReservedEpochNumberForFileIngestedBehind : cfd_->NewEpochNumber(), - f.file_checksum, f.file_checksum_func_name, f.unique_id, 0); + f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } diff --git a/db/flush_job.cc b/db/flush_job.cc index a3ffc270714..8b152ee3e0a 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -1007,7 +1007,8 @@ Status FlushJob::WriteLevel0Table() { meta_.oldest_blob_file_number, meta_.oldest_ancester_time, meta_.file_creation_time, meta_.epoch_number, meta_.file_checksum, meta_.file_checksum_func_name, - meta_.unique_id, meta_.compensated_range_deletion_size); + meta_.unique_id, meta_.compensated_range_deletion_size, + meta_.tail_size); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } // Piggyback FlushJobInfo on the first first flushed memtable. diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 9a8b48dd054..f76d0ac1b90 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -138,6 +138,16 @@ Status ImportColumnFamilyJob::Run() { const auto& f = files_to_import_[i]; const auto& file_metadata = metadata_[i]; + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + VersionEdit dummy_version_edit; dummy_version_edit.AddFile( file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), @@ -145,7 +155,7 @@ Status ImportColumnFamilyJob::Run() { file_metadata.smallest_seqno, file_metadata.largest_seqno, false, file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, current_time, file_metadata.epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, f.unique_id, 0); + kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size); s = dummy_version_builder.Apply(&dummy_version_edit); } if (s.ok()) { diff --git a/db/repair.cc b/db/repair.cc index 633c348a5c3..67513cacc4c 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -551,6 +551,17 @@ class Repairer { } t->meta.oldest_ancester_time = props->creation_time; } + if (status.ok()) { + uint64_t tail_size = 0; + bool contain_no_data_blocks = + props->num_entries > 0 && + (props->num_entries == props->num_range_deletions); + if (props->tail_start_offset > 0 || contain_no_data_blocks) { + assert(props->tail_start_offset <= file_size); + tail_size = file_size - props->tail_start_offset; + } + t->meta.tail_size = tail_size; + } ColumnFamilyData* cfd = 
nullptr; if (status.ok()) { cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id); @@ -682,7 +693,7 @@ class Repairer { table->meta.oldest_ancester_time, table->meta.file_creation_time, table->meta.epoch_number, table->meta.file_checksum, table->meta.file_checksum_func_name, table->meta.unique_id, - table->meta.compensated_range_deletion_size); + table->meta.compensated_range_deletion_size, table->meta.tail_size); } s = dummy_version_builder.Apply(&dummy_edit); if (s.ok()) { diff --git a/db/table_cache.cc b/db/table_cache.cc index c288ec8c7fd..73fa07c6d54 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -140,13 +140,13 @@ Status TableCache::GetTableReader( } s = ioptions_.table_factory->NewTableReader( ro, - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, block_protection_bytes_per_key, - skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, - block_cache_tracer_, max_file_size_for_l0_meta_pin, - db_session_id_, file_meta.fd.GetNumber(), - expected_unique_id, file_meta.fd.largest_seqno), + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + block_protection_bytes_per_key, skip_filters, immortal_tables_, + false /* force_direct_prefetch */, level, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, + file_meta.fd.GetNumber(), expected_unique_id, + file_meta.fd.largest_seqno, file_meta.tail_size), std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 611dee774b0..af41f00f6c8 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -73,7 +73,7 @@ class VersionBuilderTest : public testing::Test { /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -136,7 +136,7 @@ class VersionBuilderTest : public testing::Test { Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); } void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { @@ -183,11 +183,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -230,11 +231,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { UpdateVersionStorageInfo(); VersionEdit version_edit; - 
version_edit.AddFile( - 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -281,11 +283,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -317,31 +320,36 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 676, 0, 
100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; @@ -376,46 +384,53 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { kCompactionStyleLevel, nullptr, false); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + 
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; - version_edit.AddFile( - 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), + GetInternalKey("950"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); - version_edit.AddFile( - 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), + GetInternalKey("850"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(version_builder.Apply(&version_edit2)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -520,13 +535,14 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { constexpr bool marked_for_compaction = false; - addition.AddFile( - level, file_number, path_id, file_size, - GetInternalKey(smallest, smallest_seq), - GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&addition)); @@ -570,12 +586,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { constexpr 
SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile( - new_level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); const Status s = builder.Apply(&edit); ASSERT_TRUE(s.IsCorruption()); @@ -606,12 +623,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile( - level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&edit)); @@ -619,12 +637,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr int new_level = 2; - other_edit.AddFile( - new_level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + other_edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); const Status s = builder.Apply(&other_edit); ASSERT_TRUE(s.IsCorruption()); @@ -655,12 +674,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { VersionEdit addition; - addition.AddFile( - level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 
kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&addition)); @@ -1233,7 +1253,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/, - checksum_value, checksum_method, kNullUniqueId64x2, 0); + checksum_value, checksum_method, kNullUniqueId64x2, 0, 0); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); @@ -1321,7 +1341,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1331,7 +1351,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1552,7 +1572,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Add an SST that does not reference any blob files. edit.AddFile( @@ -1562,7 +1582,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2200, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Delete a file that references a blob file. edit.DeleteFile(/* level */ 1, /* file_number */ 6); @@ -1585,7 +1605,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1597,7 +1617,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. 
@@ -1611,7 +1631,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionEdit edit2; @@ -1712,7 +1732,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); version_edit_1.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1722,7 +1742,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionBuilder version_builder_1(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, @@ -1749,7 +1769,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); version_edit_2.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1759,7 +1779,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionBuilder version_builder_2(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, diff --git a/db/version_edit.cc b/db/version_edit.cc index ecddaa49ccd..4f1ae80d21e 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -238,6 +238,12 @@ bool VersionEdit::EncodeTo(std::string* dst) const { f.compensated_range_deletion_size); PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); } + if (f.tail_size) { + PutVarint32(dst, NewFileCustomTag::kTailSize); + std::string varint_tail_size; + PutVarint64(&varint_tail_size, f.tail_size); + PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); + } TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); @@ -416,6 +422,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "Invalid compensated range deletion size"; } break; + case kTailSize: + if (!GetVarint64(&field, &f.tail_size)) { + return "invalid tail start offset"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -851,6 +862,8 @@ std::string VersionEdit::DebugString(bool hex_key) const { InternalUniqueIdToExternal(&id); r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id))); } + r.append(" tail size:"); + AppendNumberTo(&r, f.tail_size); } for (const auto& blob_file_addition : blob_file_additions_) { @@ -966,6 +979,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { // permanent jw << "Temperature" << static_cast(f.temperature); } + jw 
<< "TailSize" << f.tail_size; jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index 65c7fc43ab7..07e8f37742c 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -90,6 +90,7 @@ enum NewFileCustomTag : uint32_t { kUniqueId = 12, kEpochNumber = 13, kCompensatedRangeDeletionSize = 14, + kTailSize = 15, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -238,6 +239,10 @@ struct FileMetaData { // SST unique id UniqueId64x2 unique_id{}; + // Size of the "tail" part of a SST file + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_size = 0; + FileMetaData() = default; FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, @@ -249,7 +254,8 @@ struct FileMetaData { uint64_t _epoch_number, const std::string& _file_checksum, const std::string& _file_checksum_func_name, UniqueId64x2 _unique_id, - const uint64_t _compensated_range_deletion_size) + const uint64_t _compensated_range_deletion_size, + uint64_t _tail_size) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -262,7 +268,8 @@ struct FileMetaData { epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), - unique_id(std::move(_unique_id)) { + unique_id(std::move(_unique_id)), + tail_size(_tail_size) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -446,7 +453,8 @@ class VersionEdit { uint64_t epoch_number, const std::string& file_checksum, const std::string& file_checksum_func_name, const UniqueId64x2& unique_id, - const uint64_t compensated_range_deletion_size) { + const uint64_t compensated_range_deletion_size, + uint64_t tail_size) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, @@ -455,7 +463,7 @@ class VersionEdit { temperature, oldest_blob_file_number, oldest_ancester_time, file_creation_time, epoch_number, file_checksum, file_checksum_func_name, unique_id, - compensated_range_deletion_size)); + compensated_range_deletion_size, tail_size)); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 1fa6c005497..da1a85999c1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -45,7 +45,7 @@ TEST_F(VersionEditTest, EncodeDecode) { kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, kInvalidBlobFileNumber, 888, 678, kBig + 300 + i /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, kBig + 700 + i); } @@ -65,24 +65,24 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 301 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, 
kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, 666, 888, 302 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 303 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, 700); @@ -123,12 +123,12 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, 686, 868, 301 /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -177,7 +177,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -208,7 +208,7 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /*epoch_number*/, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index cfe8d617366..436389b81b1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -6353,7 +6353,7 @@ Status VersionSet::WriteCurrentStateToManifest( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 481dd46d90e..c0f6d1340a4 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -51,7 +51,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); files_.push_back(f); } @@ -163,7 +163,7 @@ class VersionStorageInfoTestBase : public testing::Test { Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, compensated_range_deletion_size); + kNullUniqueId64x2, 
compensated_range_deletion_size, 0); vstorage_.AddFile(level, f); } @@ -3296,7 +3296,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + 0, 0); } } @@ -3353,7 +3353,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3414,7 +3414,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index 304d1c3bb26..77286ddcfa4 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -180,6 +180,8 @@ class FilePrefetchBuffer { RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); } + bool Enabled() const { return enable_; } + // Load data into the buffer from a file. // reader : the file reader. // offset : the file offset to start reading from. diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index fb31144498c..053caacd09e 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -236,6 +236,93 @@ TEST_P(PrefetchTest, Basic) { Close(); } +TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) { + const bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + // Second param is if directIO is enabled or not + const bool use_direct_io = std::get<1>(GetParam()); + const bool use_file_prefetch_buffer = !support_prefetch || use_direct_io; + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + + BlockBasedTableOptions bbto; + bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + bbto.partition_filters = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + ROCKSDB_GTEST_BYPASS("Direct IO is not supported"); + return; + } else { + ASSERT_OK(s); + } + + ASSERT_OK(Put("k1", "v1")); + + HistogramData pre_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &pre_flush_file_read); + ASSERT_OK(Flush()); + HistogramData post_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &post_flush_file_read); + if (use_file_prefetch_buffer) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. 
Therefore `BlockBasedTable::PrefetchTail()` + // should be the only SST read in table verification during flush. + ASSERT_EQ(post_flush_file_read.count - pre_flush_file_read.count, 1); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // will initiate extra SST reads + ASSERT_GT(post_flush_file_read.count - pre_flush_file_read.count, 1); + } + ASSERT_OK(Put("k1", "v2")); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + HistogramData pre_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &pre_compaction_file_read); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + HistogramData post_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &post_compaction_file_read); + if (use_file_prefetch_buffer) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. + // + // Therefore the 3 reads are + // (1) `ProcessKeyValueCompaction()` of input file 1 + // (2) `ProcessKeyValueCompaction()` of input file 2 + // (3) `BlockBasedTable::PrefetchTail()` of output file during table + // verification in compaction + ASSERT_EQ(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // as well as reading other parts of the tail (e.g, footer, table + // properties..) will initiate extra SST reads + ASSERT_GT(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } + Close(); +} + // This test verifies BlockBasedTableOptions.max_auto_readahead_size is // configured dynamically. TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index cbe87fa3af1..ab259f930be 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -70,6 +70,7 @@ struct TablePropertiesNames { static const std::string kSlowCompressionEstimatedDataSize; static const std::string kFastCompressionEstimatedDataSize; static const std::string kSequenceNumberTimeMapping; + static const std::string kTailStartOffset; }; // `TablePropertiesCollector` provides the mechanism for users to collect @@ -239,6 +240,10 @@ struct TableProperties { // 0 means not exists. uint64_t external_sst_file_global_seqno_offset = 0; + // Offset where the "tail" part of SST file starts + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_start_offset = 0; + // DB identity // db_id is an identifier generated the first time the DB is created // If DB identity is unset or unassigned, `db_id` will be an empty string. diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 5121d1b43d7..3d7a08e7755 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -341,6 +341,10 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr pc_rep; BlockCreateContext create_context; + // The size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. 
+  uint64_t tail_size;
+
   uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
   void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
@@ -456,6 +460,7 @@ struct BlockBasedTableBuilder::Rep {
             !use_delta_encoding_for_index_values,
             table_opt.index_type ==
                 BlockBasedTableOptions::kBinarySearchWithFirstKey),
+        tail_size(0),
         status_ok(true),
         io_status_ok(true) {
     if (tbo.target_file_size == 0) {
@@ -1908,6 +1913,8 @@ Status BlockBasedTableBuilder::Finish() {
     }
   }

+  r->props.tail_start_offset = r->offset;
+
   // Write meta blocks, metaindex block and footer in the following order.
   //    1. [meta block: filter]
   //    2. [meta block: index]
@@ -1935,6 +1942,7 @@ Status BlockBasedTableBuilder::Finish() {
   r->SetStatus(r->CopyIOStatus());
   Status ret_status = r->CopyStatus();
   assert(!ret_status.ok() || io_status().ok());
+  r->tail_size = r->offset - r->props.tail_start_offset;
   return ret_status;
 }

@@ -1968,6 +1976,8 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
   }
 }

+uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; }
+
 bool BlockBasedTableBuilder::NeedCompact() const {
   for (const auto& collector : rep_->table_properties_collectors) {
     if (collector->NeedCompact()) {
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 32ca30d1b4c..3949474c580 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -91,6 +91,10 @@ class BlockBasedTableBuilder : public TableBuilder {
   // is enabled.
   uint64_t EstimatedFileSize() const override;

+  // Get the size of the "tail" part of an SST file. "Tail" refers to
+  // all blocks after data blocks until the end of the SST file.
+  uint64_t GetTailSize() const override;
+
   bool NeedCompact() const override;

   // Get table properties
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 2bca0703327..653c222d5e4 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -568,7 +568,7 @@ Status BlockBasedTableFactory::NewTableReader(
       ro, table_reader_options.ioptions, table_reader_options.env_options,
       table_options_, table_reader_options.internal_comparator, std::move(file),
       file_size, table_reader_options.block_protection_bytes_per_key,
-      table_reader, table_reader_cache_res_mgr_,
+      table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_,
       table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
       table_reader_options.skip_filters, table_reader_options.level,
       table_reader_options.immortal, table_reader_options.largest_seqno,
diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h
index 1bf870ba6f9..1f787697772 100644
--- a/table/block_based/block_based_table_factory.h
+++ b/table/block_based/block_based_table_factory.h
@@ -28,6 +28,9 @@ class BlockBasedTableBuilder;
 class RandomAccessFileReader;
 class WritableFileWriter;

+// TODO: deprecate this class as it can be replaced with
+// `FileMetaData::tail_size`
+//
 // A class used to track actual bytes written from the tail in the recent SST
 // file opens, and provide a suggestion for following open.
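+//
+// `BlockBasedTable::PrefetchTail()` consults these stats only when the file's
+// recorded tail size is unavailable (e.g., for SST files written before the
+// tail size was recorded in the manifest), before falling back to a simple
+// file-size-based heuristic.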
class TailPrefetchStats {
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index b5144166198..a5e8e701467 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -561,7 +561,7 @@ Status BlockBasedTable::Open(
     const InternalKeyComparator& internal_comparator,
     std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
     uint8_t block_protection_bytes_per_key,
-    std::unique_ptr<TableReader>* table_reader,
+    std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
     std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
     const std::shared_ptr<const SliceTransform>& prefix_extractor,
     const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
@@ -593,7 +593,8 @@ Status BlockBasedTable::Open(
   if (!ioptions.allow_mmap_reads) {
     s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
                      tail_prefetch_stats, prefetch_all, preload_all,
-                     &prefetch_buffer, ioptions.stats);
+                     &prefetch_buffer, ioptions.stats, tail_size,
+                     ioptions.logger);
     // Return error in prefetch path to users.
     if (!s.ok()) {
       return s;
     }
@@ -807,21 +808,40 @@ Status BlockBasedTable::PrefetchTail(
     const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
     bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
     const bool prefetch_all, const bool preload_all,
-    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats) {
+    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats,
+    uint64_t tail_size, Logger* const logger) {
+  assert(tail_size <= file_size);
+
   size_t tail_prefetch_size = 0;
-  if (tail_prefetch_stats != nullptr) {
-    // Multiple threads may get a 0 (no history) when running in parallel,
-    // but it will get cleared after the first of them finishes.
-    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
-  }
-  if (tail_prefetch_size == 0) {
-    // Before read footer, readahead backwards to prefetch data. Do more
-    // readahead if we're going to read index/filter.
-    // TODO: This may incorrectly select small readahead in case partitioned
-    // index/filter is enabled and top-level partition pinning is enabled.
-    // That's because we need to issue readahead before we read the properties,
-    // at which point we don't yet know the index type.
-    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
+  if (tail_size != 0) {
+    tail_prefetch_size = tail_size;
+  } else {
+    if (tail_prefetch_stats != nullptr) {
+      // Multiple threads may get a 0 (no history) when running in parallel,
+      // but it will get cleared after the first of them finishes.
+      tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
+    }
+    if (tail_prefetch_size == 0) {
+      // Before read footer, readahead backwards to prefetch data. Do more
+      // readahead if we're going to read index/filter.
+      // TODO: This may incorrectly select small readahead in case partitioned
+      // index/filter is enabled and top-level partition pinning is enabled.
+      // That's because we need to issue readahead before we read the
+      // properties, at which point we don't yet know the index type.
+      tail_prefetch_size =
+          prefetch_all || preload_all
+              ?
 static_cast<size_t>(4 * 1024 + 0.01 * file_size)
+              : 4 * 1024;
+
+      ROCKS_LOG_WARN(logger,
+                     "Tail prefetch size %zu is calculated based on heuristics",
+                     tail_prefetch_size);
+    } else {
+      ROCKS_LOG_WARN(
+          logger,
+          "Tail prefetch size %zu is calculated based on TailPrefetchStats",
+          tail_prefetch_size);
+    }
+  }
   size_t prefetch_off;
   size_t prefetch_len;
@@ -1140,7 +1160,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
     // are hence follow the configuration for pin and prefetch regardless of
     // the value of cache_index_and_filter_blocks
     if (prefetch_all || pin_partition) {
-      s = rep_->index_reader->CacheDependencies(ro, pin_partition);
+      s = rep_->index_reader->CacheDependencies(ro, pin_partition,
+                                                prefetch_buffer);
     }
     if (!s.ok()) {
       return s;
     }
@@ -1164,7 +1185,7 @@
     if (filter) {
       // Refer to the comment above about partitioned indexes always being cached
       if (prefetch_all || pin_partition) {
-        s = filter->CacheDependencies(ro, pin_partition);
+        s = filter->CacheDependencies(ro, pin_partition, prefetch_buffer);
         if (!s.ok()) {
           return s;
         }
@@ -1984,8 +2005,8 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
   // `CacheDependencies()` brings all the blocks into cache using one I/O. That
   // way the full index scan usually finds the index data it is looking for in
   // cache rather than doing an I/O for each "dependency" (partition).
-  Status s =
-      rep_->index_reader->CacheDependencies(read_options, false /* pin */);
+  Status s = rep_->index_reader->CacheDependencies(
+      read_options, false /* pin */, nullptr /* prefetch_buffer */);
   if (!s.ok()) {
     return s;
   }
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index dafaa4ebf85..0f14e05dd14 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -99,7 +99,7 @@ class BlockBasedTable : public TableReader {
       const InternalKeyComparator& internal_key_comparator,
       std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
       uint8_t block_protection_bytes_per_key,
-      std::unique_ptr<TableReader>* table_reader,
+      std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
       std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
           nullptr,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
@@ -225,8 +225,9 @@
   virtual size_t ApproximateMemoryUsage() const = 0;
   // Cache the dependencies of the index reader (e.g. the partitions
   // of a partitioned index).
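+  // If a non-null, enabled `tail_prefetch_buffer` is passed in,
+  // implementations may read the partitions through the already-prefetched
+  // tail instead of creating a prefetch buffer of their own (see the
+  // partitioned index/filter readers).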
-  virtual Status CacheDependencies(const ReadOptions& /*ro*/,
-                                   bool /* pin */) {
+  virtual Status CacheDependencies(
+      const ReadOptions& /*ro*/, bool /* pin */,
+      FilePrefetchBuffer* /* tail_prefetch_buffer */) {
     return Status::OK();
   }
 };
@@ -458,7 +459,8 @@ class BlockBasedTable : public TableReader {
       const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
       bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
       const bool prefetch_all, const bool preload_all,
-      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats);
+      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats,
+      uint64_t tail_size, Logger* const logger);
   Status ReadMetaIndexBlock(const ReadOptions& ro,
                             FilePrefetchBuffer* prefetch_buffer,
                             std::unique_ptr<Block>* metaindex_block,
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index 957413d1067..b14858c0209 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -163,7 +163,9 @@ class FilterBlockReader {
     return error_msg;
   }

-  virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) {
+  virtual Status CacheDependencies(
+      const ReadOptions& /*ro*/, bool /*pin*/,
+      FilePrefetchBuffer* /* tail_prefetch_buffer */) {
     return Status::OK();
   }

diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index 569b58fef06..9c2b61e8761 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -439,8 +439,8 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
 }

 // TODO(myabandeh): merge this with the same function in IndexReader
-Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
-                                                       bool pin) {
+Status PartitionedFilterBlockReader::CacheDependencies(
+    const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) {
   assert(table());
   const BlockBasedTable::Rep* const rep = table()->get_rep();
@@ -484,21 +484,22 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
       handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize;
   uint64_t prefetch_len = last_off - prefetch_off;
   std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
-  rep->CreateFilePrefetchBuffer(
-      0, 0, &prefetch_buffer, false /* Implicit autoreadahead */,
-      0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
-
-  IOOptions opts;
-  s = rep->file->PrepareIOOptions(ro, opts);
-  if (s.ok()) {
-    s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
-                                  static_cast<size_t>(prefetch_len),
-                                  ro.rate_limiter_priority);
-  }
-  if (!s.ok()) {
-    return s;
+  if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled()) {
+    rep->CreateFilePrefetchBuffer(
+        0, 0, &prefetch_buffer, false /* Implicit autoreadahead */,
+        0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
+
+    IOOptions opts;
+    s = rep->file->PrepareIOOptions(ro, opts);
+    if (s.ok()) {
+      s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
+                                    static_cast<size_t>(prefetch_len),
+                                    ro.rate_limiter_priority);
+    }
+    if (!s.ok()) {
+      return s;
+    }
   }
-
   // After prefetch, read the partitions one by one
   for (biter.SeekToFirst(); biter.Valid(); biter.Next()) {
     handle = biter.value().handle;
@@ -507,7 +508,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
     // TODO: Support counter batch update for partitioned index and
     // filter blocks
     s = table()->MaybeReadBlockAndLoadToCache(
-        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
+        prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
+        handle, UncompressionDict::GetEmptyDict(),
         /* for_compaction */ false, &block, nullptr /* get_context */,
         &lookup_context, nullptr /* contents */, false);
     if (!s.ok()) {
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 9f7a7be7b91..0e3b4a5b34d 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -166,7 +166,8 @@ class PartitionedFilterBlockReader
       BlockCacheLookupContext* lookup_context,
       const ReadOptions& read_options,
       FilterManyFunction filter_function) const;
-  Status CacheDependencies(const ReadOptions& ro, bool pin) override;
+  Status CacheDependencies(const ReadOptions& ro, bool pin,
+                           FilePrefetchBuffer* tail_prefetch_buffer) override;
   const InternalKeyComparator* internal_comparator() const;
   bool index_key_includes_seq() const;
diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc
index d1cc88834ea..f322cc910d7 100644
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@@ -112,8 +112,8 @@ InternalIteratorBase<IndexValue>* PartitionIndexReader::NewIterator(
   // the first level iter is always on heap and will attempt to delete it
   // in its destructor.
 }
-Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
-                                               bool pin) {
+Status PartitionIndexReader::CacheDependencies(
+    const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) {
   if (!partition_map_.empty()) {
     // The dependencies are already cached since `partition_map_` is filled in
     // an all-or-nothing manner.
@@ -162,22 +162,23 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
       handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle);
   uint64_t prefetch_len = last_off - prefetch_off;
   std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
-  rep->CreateFilePrefetchBuffer(
-      0, 0, &prefetch_buffer, false /*Implicit auto readahead*/,
-      0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
-  IOOptions opts;
-  {
-    Status s = rep->file->PrepareIOOptions(ro, opts);
-    if (s.ok()) {
-      s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
-                                    static_cast<size_t>(prefetch_len),
-                                    ro.rate_limiter_priority);
-    }
-    if (!s.ok()) {
-      return s;
+  if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled()) {
+    rep->CreateFilePrefetchBuffer(
+        0, 0, &prefetch_buffer, false /*Implicit auto readahead*/,
+        0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
+    IOOptions opts;
+    {
+      Status s = rep->file->PrepareIOOptions(ro, opts);
+      if (s.ok()) {
+        s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
+                                      static_cast<size_t>(prefetch_len),
+                                      ro.rate_limiter_priority);
+      }
+      if (!s.ok()) {
+        return s;
+      }
     }
   }
-
   // For saving "all or nothing" to partition_map_
   UnorderedMap<uint64_t, CachableEntry<Block>> map_in_progress;
@@ -191,7 +192,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
     // TODO: Support counter batch update for partitioned index and
     // filter blocks
     Status s = table()->MaybeReadBlockAndLoadToCache(
-        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
+        prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
+        handle, UncompressionDict::GetEmptyDict(),
         /*for_compaction=*/false, &block.As<Block>(),
         /*get_context=*/nullptr, &lookup_context,
         /*contents=*/nullptr,
         /*async_read=*/false);
diff --git a/table/block_based/partitioned_index_reader.h b/table/block_based/partitioned_index_reader.h
index 58a7877ab5d..9482fd6b44c 100644
--- a/table/block_based/partitioned_index_reader.h
+++ b/table/block_based/partitioned_index_reader.h
@@ -30,7 +30,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
                         IndexBlockIter* iter, GetContext* get_context,
                         BlockCacheLookupContext* lookup_context) override;
-  Status CacheDependencies(const ReadOptions& ro, bool pin) override;
+  Status CacheDependencies(const ReadOptions& ro, bool pin,
+                           FilePrefetchBuffer* tail_prefetch_buffer) override;
   size_t ApproximateMemoryUsage() const override {
     size_t usage = ApproximateIndexBlockMemoryUsage();
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 9696a509dce..c2f6552cc4b 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -266,9 +266,10 @@ class BlockFetcherTest : public testing::Test {
     const auto* table_options =
         table_factory_.GetOptions<BlockBasedTableOptions>();
     ASSERT_NE(table_options, nullptr);
-    ASSERT_OK(BlockBasedTable::Open(
-        ro, ioptions, EnvOptions(), *table_options, comparator, std::move(file),
-        file_size, 0 /* block_protection_bytes_per_key */, &table_reader));
+    ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
+                                    comparator, std::move(file), file_size,
+                                    0 /* block_protection_bytes_per_key */,
+                                    &table_reader, 0 /* tail_size */));

     table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
   }
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 2c58ff9c7d9..6fea536d624 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -115,6 +115,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
     Add(TablePropertiesNames::kFastCompressionEstimatedDataSize,
         props.fast_compression_estimated_data_size);
   }
+  Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset);
   if (!props.db_id.empty()) {
     Add(TablePropertiesNames::kDbId, props.db_id);
   }
@@ -307,6 +308,8 @@ Status ReadTablePropertiesHelper(
         &new_table_properties->slow_compression_estimated_data_size},
        {TablePropertiesNames::kFastCompressionEstimatedDataSize,
         &new_table_properties->fast_compression_estimated_data_size},
+       {TablePropertiesNames::kTailStartOffset,
+        &new_table_properties->tail_start_offset},
   };

   std::string last_key;
diff --git a/table/table_builder.h b/table/table_builder.h
index e1bb4b55747..6d98bce7078 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -42,7 +42,8 @@ struct TableReaderOptions {
       int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr,
       size_t _max_file_size_for_l0_meta_pin = 0,
       const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0,
-      UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0)
+      UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0,
+      uint64_t _tail_size = 0)
       : ioptions(_ioptions),
         prefix_extractor(_prefix_extractor),
         env_options(_env_options),
@@ -57,7 +58,8 @@ struct TableReaderOptions {
         cur_db_session_id(_cur_db_session_id),
         cur_file_num(_cur_file_num),
         unique_id(_unique_id),
-        block_protection_bytes_per_key(_block_protection_bytes_per_key) {}
+        block_protection_bytes_per_key(_block_protection_bytes_per_key),
+        tail_size(_tail_size) {}
  const ImmutableOptions& ioptions;
  const std::shared_ptr<const SliceTransform>& prefix_extractor;
@@ -89,6 +91,8 @@ struct TableReaderOptions {
   UniqueId64x2 unique_id;

   uint8_t block_protection_bytes_per_key;
+
+  uint64_t tail_size;
 };

 struct TableBuilderOptions {
@@ -200,6 +204,8 @@ class TableBuilder {
   // is enabled.
   virtual uint64_t EstimatedFileSize() const { return FileSize(); }

+  virtual uint64_t GetTailSize() const { return 0; }
+
   // If the user defined table properties collector suggest the file to
   // be further compacted.
   virtual bool NeedCompact() const { return false; }
diff --git a/table/table_properties.cc b/table/table_properties.cc
index b382281f857..819d4da2ad8 100644
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@@ -303,6 +303,8 @@ const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize =
     "rocksdb.sample_for_compression.fast.data.size";
 const std::string TablePropertiesNames::kSequenceNumberTimeMapping =
     "rocksdb.seqno.time.map";
+const std::string TablePropertiesNames::kTailStartOffset =
+    "rocksdb.tail.start.offset";

 #ifndef NDEBUG
 // WARNING: TEST_SetRandomTableProperties assumes the following layout of