Skip to content

Commit

Permalink
Add file temperature related counter and bytes stats to and io_stats (f…
Browse files Browse the repository at this point in the history
…acebook#8710)

Summary:
For tiered storage project, we need to know the block read count and read bytes of files with different temperature. Add FileIOByTemperature to IOStatsContext and collect the bytes read and read count from different temperature files through the RandomAccessFileReader.

Pull Request resolved: facebook#8710

Test Plan: make check, add the testing cases

Reviewed By: siying

Differential Revision: D30582400

Pulled By: zhichao-cao

fbshipit-source-id: d83173de594374fc8404af5ce93a6a9be72c7141
  • Loading branch information
zhichao-cao authored and facebook-github-bot committed Oct 7, 2021
1 parent 699f450 commit b632ed0
Show file tree
Hide file tree
Showing 9 changed files with 160 additions and 38 deletions.
37 changes: 37 additions & 0 deletions db/db_test2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "options/options_helper.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/trace_record.h"
#include "rocksdb/trace_record_result.h"
Expand Down Expand Up @@ -6522,6 +6523,9 @@ TEST_F(DBTest2, BottommostTemperature) {
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());

get_iostats_context()->Reset();
IOStatsContext* iostats = get_iostats_context();

ColumnFamilyMetaData metadata;
db_->GetColumnFamilyMetaData(&metadata);
ASSERT_EQ(1, metadata.file_count);
Expand All @@ -6530,11 +6534,31 @@ TEST_F(DBTest2, BottommostTemperature) {
ASSERT_EQ(size, 0);
size = GetSstSizeHelper(Temperature::kWarm);
ASSERT_GT(size, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);

ASSERT_EQ("bar", Get("foo"));

ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);

// non-bottommost file still has unknown temperature
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put("bar", "bar"));
ASSERT_OK(Flush());
ASSERT_EQ("bar", Get("bar"));
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);

db_->GetColumnFamilyMetaData(&metadata);
ASSERT_EQ(2, metadata.file_count);
ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
Expand Down Expand Up @@ -6583,6 +6607,8 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) {
ASSERT_EQ(size, 0);
size = GetSstSizeHelper(Temperature::kHot);
ASSERT_EQ(size, 0);
get_iostats_context()->Reset();
IOStatsContext* iostats = get_iostats_context();

for (int i = 0; i < kTriggerNum; i++) {
ASSERT_OK(Put("foo", "bar"));
Expand All @@ -6600,6 +6626,17 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) {
ASSERT_GT(size, 0);
size = GetSstSizeHelper(Temperature::kWarm);
ASSERT_EQ(size, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ("bar", Get("foo"));

ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);

ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put("bar", "bar"));
Expand Down
37 changes: 19 additions & 18 deletions db/table_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ Status TableCache::GetTableReader(
std::unique_ptr<TableReader>* table_reader,
const SliceTransform* prefix_extractor, bool skip_filters, int level,
bool prefetch_index_and_filter_in_cache,
size_t max_file_size_for_l0_meta_pin) {
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
std::string fname =
TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId());
std::unique_ptr<FSRandomAccessFile> file;
Expand Down Expand Up @@ -132,7 +132,8 @@ Status TableCache::GetTableReader(
new RandomAccessFileReader(
std::move(file), fname, ioptions_.clock, io_tracer_,
record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS,
file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners));
file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners,
file_temperature));
s = ioptions_.table_factory->NewTableReader(
ro,
TableReaderOptions(
Expand All @@ -154,15 +155,13 @@ void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
cache_->Erase(key);
}

Status TableCache::FindTable(const ReadOptions& ro,
const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, Cache::Handle** handle,
const SliceTransform* prefix_extractor,
const bool no_io, bool record_read_stats,
HistogramImpl* file_read_hist, bool skip_filters,
int level, bool prefetch_index_and_filter_in_cache,
size_t max_file_size_for_l0_meta_pin) {
Status TableCache::FindTable(
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
Cache::Handle** handle, const SliceTransform* prefix_extractor,
const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
uint64_t number = fd.GetNumber();
Slice key = GetSliceForFileNumber(&number);
Expand All @@ -186,7 +185,7 @@ Status TableCache::FindTable(const ReadOptions& ro,
ro, file_options, internal_comparator, fd, false /* sequential mode */,
record_read_stats, file_read_hist, &table_reader, prefix_extractor,
skip_filters, level, prefetch_index_and_filter_in_cache,
max_file_size_for_l0_meta_pin);
max_file_size_for_l0_meta_pin, file_temperature);
if (!s.ok()) {
assert(table_reader == nullptr);
RecordTick(ioptions_.stats, NO_FILE_ERRORS);
Expand Down Expand Up @@ -231,7 +230,7 @@ InternalIterator* TableCache::NewIterator(
options.read_tier == kBlockCacheTier /* no_io */,
!for_compaction /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
max_file_size_for_l0_meta_pin);
max_file_size_for_l0_meta_pin, file_meta.temperature);
if (s.ok()) {
table_reader = GetTableReaderFromHandle(handle);
}
Expand Down Expand Up @@ -431,7 +430,7 @@ Status TableCache::Get(const ReadOptions& options,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
max_file_size_for_l0_meta_pin);
max_file_size_for_l0_meta_pin, file_meta.temperature);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
}
Expand Down Expand Up @@ -531,10 +530,12 @@ Status TableCache::MultiGet(const ReadOptions& options,
// found in the row cache and thus the range may now be empty
if (s.ok() && !table_range.empty()) {
if (t == nullptr) {
s = FindTable(
options, file_options_, internal_comparator, fd, &handle,
prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters, level);
s = FindTable(options, file_options_, internal_comparator, fd, &handle,
prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
Expand Down
6 changes: 4 additions & 2 deletions db/table_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ class TableCache {
HistogramImpl* file_read_hist = nullptr,
bool skip_filters = false, int level = -1,
bool prefetch_index_and_filter_in_cache = true,
size_t max_file_size_for_l0_meta_pin = 0);
size_t max_file_size_for_l0_meta_pin = 0,
Temperature file_temperature = Temperature::kUnknown);

// Get TableReader from a cache handle.
TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
Expand Down Expand Up @@ -206,7 +207,8 @@ class TableCache {
const SliceTransform* prefix_extractor = nullptr,
bool skip_filters = false, int level = -1,
bool prefetch_index_and_filter_in_cache = true,
size_t max_file_size_for_l0_meta_pin = 0);
size_t max_file_size_for_l0_meta_pin = 0,
Temperature file_temperature = Temperature::kUnknown);

// Create a key prefix for looking up the row cache. The prefix is of the
// format row_cache_id + fd_number + seq_no. Later, the user key can be
Expand Down
3 changes: 2 additions & 1 deletion db/version_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,8 @@ class VersionBuilder::Rep {
&file_meta->table_reader_handle, prefix_extractor, false /*no_io */,
true /* record_read_stats */,
internal_stats->GetFileReadHist(level), false, level,
prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin);
prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
file_meta->temperature);
if (file_meta->table_reader_handle != nullptr) {
// Load table_reader
file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
Expand Down
45 changes: 45 additions & 0 deletions file/random_access_file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,46 @@
#include "util/rate_limiter.h"

namespace ROCKSDB_NAMESPACE {
inline void IOStatsAddBytesByTemperature(Temperature file_temperature,
size_t value) {
if (file_temperature == Temperature::kUnknown) {
return;
}
switch (file_temperature) {
case Temperature::kHot:
IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, value);
break;
case Temperature::kWarm:
IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, value);
break;
case Temperature::kCold:
IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, value);
break;
default:
break;
}
}

inline void IOStatsAddCountByTemperature(Temperature file_temperature,
size_t value) {
if (file_temperature == Temperature::kUnknown) {
return;
}
switch (file_temperature) {
case Temperature::kHot:
IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, value);
break;
case Temperature::kWarm:
IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, value);
break;
case Temperature::kCold:
IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, value);
break;
default:
break;
}
}

IOStatus RandomAccessFileReader::Create(
const std::shared_ptr<FileSystem>& fs, const std::string& fname,
const FileOptions& file_opts,
Expand Down Expand Up @@ -182,6 +222,8 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
*result = Slice(res_scratch, io_s.ok() ? pos : 0);
}
IOSTATS_ADD(bytes_read, result->size());
IOStatsAddBytesByTemperature(file_temperature_, result->size());
IOStatsAddCountByTemperature(file_temperature_, 1);
SetPerfLevel(prev_perf_level);
}
if (stats_ != nullptr && file_read_hist_ != nullptr) {
Expand Down Expand Up @@ -347,6 +389,9 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
}
#endif // ROCKSDB_LITE
IOSTATS_ADD(bytes_read, read_reqs[i].result.size());
IOStatsAddBytesByTemperature(file_temperature_,
read_reqs[i].result.size());
IOStatsAddCountByTemperature(file_temperature_, 1);
}
SetPerfLevel(prev_perf_level);
}
Expand Down
7 changes: 5 additions & 2 deletions file/random_access_file_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class RandomAccessFileReader {
HistogramImpl* file_read_hist_;
RateLimiter* rate_limiter_;
std::vector<std::shared_ptr<EventListener>> listeners_;
Temperature file_temperature_;

public:
explicit RandomAccessFileReader(
Expand All @@ -82,15 +83,17 @@ class RandomAccessFileReader {
Statistics* stats = nullptr, uint32_t hist_type = 0,
HistogramImpl* file_read_hist = nullptr,
RateLimiter* rate_limiter = nullptr,
const std::vector<std::shared_ptr<EventListener>>& listeners = {})
const std::vector<std::shared_ptr<EventListener>>& listeners = {},
Temperature file_temperature = Temperature::kUnknown)
: file_(std::move(raf), io_tracer, _file_name),
file_name_(std::move(_file_name)),
clock_(clock),
stats_(stats),
hist_type_(hist_type),
file_read_hist_(file_read_hist),
rate_limiter_(rate_limiter),
listeners_() {
listeners_(),
file_temperature_(file_temperature) {
#ifndef ROCKSDB_LITE
std::for_each(listeners.begin(), listeners.end(),
[this](const std::shared_ptr<EventListener>& e) {
Expand Down
28 changes: 28 additions & 0 deletions include/rocksdb/iostats_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,32 @@

namespace ROCKSDB_NAMESPACE {

// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each
// item in Temperature class.
struct FileIOByTemperature {
// the number of bytes read to Temperature::kHot file
uint64_t hot_file_bytes_read;
// the number of bytes read to Temperature::kWarm file
uint64_t warm_file_bytes_read;
// the number of bytes read to Temperature::kCold file
uint64_t cold_file_bytes_read;
// total number of reads to Temperature::kHot file
uint64_t hot_file_read_count;
// total number of reads to Temperature::kWarm file
uint64_t warm_file_read_count;
// total number of reads to Temperature::kCold file
uint64_t cold_file_read_count;
// reset all the statistics to 0.
void Reset() {
hot_file_bytes_read = 0;
warm_file_bytes_read = 0;
cold_file_bytes_read = 0;
hot_file_read_count = 0;
warm_file_read_count = 0;
cold_file_read_count = 0;
}
};

struct IOStatsContext {
// reset all io-stats counter to zero
void Reset();
Expand Down Expand Up @@ -48,6 +74,8 @@ struct IOStatsContext {
uint64_t cpu_write_nanos;
// CPU time spent in read() and pread()
uint64_t cpu_read_nanos;

FileIOByTemperature file_io_stats_by_temperature;
};

// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global,
Expand Down
8 changes: 7 additions & 1 deletion monitoring/iostats_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ void IOStatsContext::Reset() {
logger_nanos = 0;
cpu_write_nanos = 0;
cpu_read_nanos = 0;
file_io_stats_by_temperature.Reset();
#endif //! NIOSTATS_CONTEXT
}

Expand Down Expand Up @@ -66,7 +67,12 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const {
IOSTATS_CONTEXT_OUTPUT(logger_nanos);
IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos);
IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos);

IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read);
IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read);
IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read);
IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count);
IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count);
IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count);
std::string str = ss.str();
str.erase(str.find_last_not_of(", ") + 1);
return str;
Expand Down
27 changes: 13 additions & 14 deletions monitoring/perf_context_imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,20 +77,19 @@ extern thread_local PerfContext perf_context;
}

// Increase metric value
#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \
if (perf_level >= PerfLevel::kEnableCount && \
perf_context.per_level_perf_context_enabled && \
perf_context.level_to_perf_context) { \
if ((*(perf_context.level_to_perf_context)).find(level) != \
(*(perf_context.level_to_perf_context)).end()) { \
(*(perf_context.level_to_perf_context))[level].metric += value; \
} \
else { \
PerfContextByLevel empty_context; \
(*(perf_context.level_to_perf_context))[level] = empty_context; \
(*(perf_context.level_to_perf_context))[level].metric += value; \
} \
} \
#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \
if (perf_level >= PerfLevel::kEnableCount && \
perf_context.per_level_perf_context_enabled && \
perf_context.level_to_perf_context) { \
if ((*(perf_context.level_to_perf_context)).find(level) != \
(*(perf_context.level_to_perf_context)).end()) { \
(*(perf_context.level_to_perf_context))[level].metric += value; \
} else { \
PerfContextByLevel empty_context; \
(*(perf_context.level_to_perf_context))[level] = empty_context; \
(*(perf_context.level_to_perf_context))[level].metric += value; \
} \
}

#endif

Expand Down

0 comments on commit b632ed0

Please sign in to comment.