Skip to content

Commit

Permalink
Support readahead during compaction for blob files (#9187)
Browse files Browse the repository at this point in the history
Summary:
The patch adds a new BlobDB configuration option `blob_compaction_readahead_size`
that can be used to enable prefetching data from blob files during compaction.
This is important when using storage with higher latencies like HDDs or remote filesystems.
If enabled, prefetching is used for all cases when blobs are read during compaction,
namely garbage collection, compaction filters (when the existing value has to be read from
a blob file), and `Merge` (when the value of the base `Put` is stored in a blob file).

Pull Request resolved: facebook/rocksdb#9187

Test Plan: Ran `make check` and the stress/crash test.

Reviewed By: riversand963

Differential Revision: D32565512

Pulled By: ltamasi

fbshipit-source-id: 87be9cebc3aa01cc227bec6b5f64d827b8164f5d
  • Loading branch information
ltamasi authored and facebook-github-bot committed Nov 20, 2021
1 parent cd4ea67 commit dc5de45
Show file tree
Hide file tree
Showing 48 changed files with 599 additions and 163 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,7 @@ set(SOURCES
db/blob/blob_log_format.cc
db/blob/blob_log_sequential_reader.cc
db/blob/blob_log_writer.cc
db/blob/prefetch_buffer_collection.cc
db/builder.cc
db/c.cc
db/column_family.cc
Expand Down
2 changes: 2 additions & 0 deletions TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ cpp_library(
"db/blob/blob_log_format.cc",
"db/blob/blob_log_sequential_reader.cc",
"db/blob/blob_log_writer.cc",
"db/blob/prefetch_buffer_collection.cc",
"db/builder.cc",
"db/c.cc",
"db/column_family.cc",
Expand Down Expand Up @@ -487,6 +488,7 @@ cpp_library(
"db/blob/blob_log_format.cc",
"db/blob/blob_log_sequential_reader.cc",
"db/blob/blob_log_writer.cc",
"db/blob/prefetch_buffer_collection.cc",
"db/builder.cc",
"db/c.cc",
"db/column_family.cc",
Expand Down
28 changes: 20 additions & 8 deletions db/blob/blob_fetcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,26 @@

namespace ROCKSDB_NAMESPACE {

Status BlobFetcher::FetchBlob(const Slice& user_key, const Slice& blob_index,
PinnableSlice* blob_value) {
Status s;
Status BlobFetcher::FetchBlob(const Slice& user_key,
const Slice& blob_index_slice,
FilePrefetchBuffer* prefetch_buffer,
PinnableSlice* blob_value,
uint64_t* bytes_read) const {
assert(version_);
constexpr uint64_t* bytes_read = nullptr;
s = version_->GetBlob(read_options_, user_key, blob_index, blob_value,
bytes_read);
return s;

return version_->GetBlob(read_options_, user_key, blob_index_slice,
prefetch_buffer, blob_value, bytes_read);
}

// Fetches the blob identified by an already-decoded BlobIndex for `user_key`.
// Thin forwarding wrapper: delegates directly to Version::GetBlob() with the
// fetcher's stored read options. `prefetch_buffer` is passed through so reads
// during compaction can be served from prefetched data (presumably may be
// nullptr to disable prefetching — confirm against Version::GetBlob). On
// success the blob is returned via `blob_value`; `bytes_read`, if used by the
// callee, receives the number of bytes read.
Status BlobFetcher::FetchBlob(const Slice& user_key,
const BlobIndex& blob_index,
FilePrefetchBuffer* prefetch_buffer,
PinnableSlice* blob_value,
uint64_t* bytes_read) const {
// Invariant: a BlobFetcher is always constructed with a non-null Version.
assert(version_);

return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
blob_value, bytes_read);
}

} // namespace ROCKSDB_NAMESPACE
} // namespace ROCKSDB_NAMESPACE
21 changes: 16 additions & 5 deletions db/blob/blob_fetcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,29 @@
#include "rocksdb/status.h"

namespace ROCKSDB_NAMESPACE {

class Version;
class Slice;
class FilePrefetchBuffer;
class PinnableSlice;
class BlobIndex;

// A thin wrapper around the blob retrieval functionality of Version.
class BlobFetcher {
public:
BlobFetcher(Version* version, const ReadOptions& read_options)
BlobFetcher(const Version* version, const ReadOptions& read_options)
: version_(version), read_options_(read_options) {}

Status FetchBlob(const Slice& user_key, const Slice& blob_index,
PinnableSlice* blob_value);
Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
FilePrefetchBuffer* prefetch_buffer,
PinnableSlice* blob_value, uint64_t* bytes_read) const;

Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index,
FilePrefetchBuffer* prefetch_buffer,
PinnableSlice* blob_value, uint64_t* bytes_read) const;

private:
Version* version_;
const Version* version_;
ReadOptions read_options_;
};
} // namespace ROCKSDB_NAMESPACE
} // namespace ROCKSDB_NAMESPACE
24 changes: 20 additions & 4 deletions db/blob/blob_file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <string>

#include "db/blob/blob_log_format.h"
#include "file/file_prefetch_buffer.h"
#include "file/filename.h"
#include "monitoring/statistics.h"
#include "options/cf_options.h"
Expand Down Expand Up @@ -282,6 +283,7 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options,
const Slice& user_key, uint64_t offset,
uint64_t value_size,
CompressionType compression_type,
FilePrefetchBuffer* prefetch_buffer,
PinnableSlice* value,
uint64_t* bytes_read) const {
assert(value);
Expand Down Expand Up @@ -313,7 +315,21 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options,
Buffer buf;
AlignedBuf aligned_buf;

{
bool prefetched = false;

if (prefetch_buffer) {
Status s;
constexpr bool for_compaction = true;

prefetched = prefetch_buffer->TryReadFromCache(
IOOptions(), file_reader_.get(), record_offset,
static_cast<size_t>(record_size), &record_slice, &s, for_compaction);
if (!s.ok()) {
return s;
}
}

if (!prefetched) {
TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile");

const Status s = ReadFromFile(file_reader_.get(), record_offset,
Expand All @@ -322,11 +338,11 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options,
if (!s.ok()) {
return s;
}

TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult",
&record_slice);
}

TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult",
&record_slice);

if (read_options.verify_checksums) {
const Status s = VerifyBlob(record_slice, user_key, value_size);
if (!s.ok()) {
Expand Down
4 changes: 3 additions & 1 deletion db/blob/blob_file_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ struct FileOptions;
class HistogramImpl;
struct ReadOptions;
class Slice;
class FilePrefetchBuffer;
class PinnableSlice;
class Statistics;

Expand All @@ -41,7 +42,8 @@ class BlobFileReader {

Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
uint64_t offset, uint64_t value_size,
CompressionType compression_type, PinnableSlice* value,
CompressionType compression_type,
FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
uint64_t* bytes_read) const;

// offsets must be sorted in ascending order by caller.
Expand Down
57 changes: 36 additions & 21 deletions db/blob/blob_file_reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,15 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
ReadOptions read_options;
read_options.verify_checksums = false;

constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;

{
PinnableSlice value;
uint64_t bytes_read = 0;

ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0],
blob_sizes[0], kNoCompression, &value,
&bytes_read));
blob_sizes[0], kNoCompression, prefetch_buffer,
&value, &bytes_read));
ASSERT_EQ(value, blobs[0]);
ASSERT_EQ(bytes_read, blob_sizes[0]);

Expand Down Expand Up @@ -222,8 +224,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
uint64_t bytes_read = 0;

ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1],
blob_sizes[1], kNoCompression, &value,
&bytes_read));
blob_sizes[1], kNoCompression, prefetch_buffer,
&value, &bytes_read));
ASSERT_EQ(value, blobs[1]);

const uint64_t key_size = keys[1].size();
Expand All @@ -239,8 +241,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {

ASSERT_TRUE(reader
->GetBlob(read_options, keys[0], blob_offsets[0] - 1,
blob_sizes[0], kNoCompression, &value,
&bytes_read)
blob_sizes[0], kNoCompression, prefetch_buffer,
&value, &bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);
}
Expand All @@ -252,8 +254,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {

ASSERT_TRUE(reader
->GetBlob(read_options, keys[2], blob_offsets[2] + 1,
blob_sizes[2], kNoCompression, &value,
&bytes_read)
blob_sizes[2], kNoCompression, prefetch_buffer,
&value, &bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);
}
Expand All @@ -265,7 +267,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {

ASSERT_TRUE(reader
->GetBlob(read_options, keys[0], blob_offsets[0],
blob_sizes[0], kZSTD, &value, &bytes_read)
blob_sizes[0], kZSTD, prefetch_buffer, &value,
&bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);
}
Expand All @@ -280,8 +283,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
->GetBlob(read_options, shorter_key,
blob_offsets[0] -
(keys[0].size() - sizeof(shorter_key) + 1),
blob_sizes[0], kNoCompression, &value,
&bytes_read)
blob_sizes[0], kNoCompression, prefetch_buffer,
&value, &bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);

Expand Down Expand Up @@ -323,8 +326,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {

ASSERT_TRUE(reader
->GetBlob(read_options, incorrect_key, blob_offsets[0],
blob_sizes[0], kNoCompression, &value,
&bytes_read)
blob_sizes[0], kNoCompression, prefetch_buffer,
&value, &bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);

Expand Down Expand Up @@ -363,8 +366,8 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {

ASSERT_TRUE(reader
->GetBlob(read_options, keys[1], blob_offsets[1],
blob_sizes[1] + 1, kNoCompression, &value,
&bytes_read)
blob_sizes[1] + 1, kNoCompression,
prefetch_buffer, &value, &bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);

Expand Down Expand Up @@ -642,12 +645,14 @@ TEST_F(BlobFileReaderTest, BlobCRCError) {

SyncPoint::GetInstance()->EnableProcessing();

constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
PinnableSlice value;
uint64_t bytes_read = 0;

ASSERT_TRUE(reader
->GetBlob(ReadOptions(), key, blob_offset, blob_size,
kNoCompression, &value, &bytes_read)
kNoCompression, prefetch_buffer, &value,
&bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);

Expand Down Expand Up @@ -695,12 +700,15 @@ TEST_F(BlobFileReaderTest, Compression) {
ReadOptions read_options;
read_options.verify_checksums = false;

constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;

{
PinnableSlice value;
uint64_t bytes_read = 0;

ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
kSnappyCompression, &value, &bytes_read));
kSnappyCompression, prefetch_buffer, &value,
&bytes_read));
ASSERT_EQ(value, blob);
ASSERT_EQ(bytes_read, blob_size);
}
Expand All @@ -712,7 +720,8 @@ TEST_F(BlobFileReaderTest, Compression) {
uint64_t bytes_read = 0;

ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
kSnappyCompression, &value, &bytes_read));
kSnappyCompression, prefetch_buffer, &value,
&bytes_read));
ASSERT_EQ(value, blob);

constexpr uint64_t key_size = sizeof(key) - 1;
Expand Down Expand Up @@ -770,12 +779,14 @@ TEST_F(BlobFileReaderTest, UncompressionError) {

SyncPoint::GetInstance()->EnableProcessing();

constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
PinnableSlice value;
uint64_t bytes_read = 0;

ASSERT_TRUE(reader
->GetBlob(ReadOptions(), key, blob_offset, blob_size,
kSnappyCompression, &value, &bytes_read)
kSnappyCompression, prefetch_buffer, &value,
&bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);

Expand Down Expand Up @@ -854,12 +865,14 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) {
} else {
ASSERT_OK(s);

constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
PinnableSlice value;
uint64_t bytes_read = 0;

ASSERT_TRUE(reader
->GetBlob(ReadOptions(), key, blob_offset, blob_size,
kNoCompression, &value, &bytes_read)
kNoCompression, prefetch_buffer, &value,
&bytes_read)
.IsIOError());
ASSERT_EQ(bytes_read, 0);
}
Expand Down Expand Up @@ -937,12 +950,14 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
} else {
ASSERT_OK(s);

constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
PinnableSlice value;
uint64_t bytes_read = 0;

ASSERT_TRUE(reader
->GetBlob(ReadOptions(), key, blob_offset, blob_size,
kNoCompression, &value, &bytes_read)
kNoCompression, prefetch_buffer, &value,
&bytes_read)
.IsCorruption());
ASSERT_EQ(bytes_read, 0);
}
Expand Down
Loading

0 comments on commit dc5de45

Please sign in to comment.