Skip to content

Commit 0003927

Browse files
committed
[mpt] Add configurable root offsets chunk allocation
Add --root-offsets-chunk-count flag to monad_mpt to make the number of chunks allocated for root offsets configurable (default: 2; must be a power of 2). Each chunk holds 1 << 24 = 16,777,216 (about 16.8M) history entries, allowing operators to adjust history depth based on deployment requirements. Previously, this was hardcoded to 2 chunks.
1 parent 3f56f79 commit 0003927

File tree

7 files changed

+133
-35
lines changed

7 files changed

+133
-35
lines changed

category/async/storage_pool.cpp

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,12 @@ size_t storage_pool::device_t::chunks() const
8080
return metadata_->chunks(size_of_file_);
8181
}
8282

83+
size_t storage_pool::device_t::cnv_chunks() const
84+
{
85+
MONAD_ASSERT(!is_zoned_device(), "zonefs support isn't implemented yet");
86+
return metadata_->num_cnv_chunks;
87+
}
88+
8389
std::pair<file_offset_t, file_offset_t> storage_pool::device_t::capacity() const
8490
{
8591
switch (type_) {
@@ -454,6 +460,7 @@ storage_pool::device_t storage_pool::make_device_(
454460
memcpy(metadata_footer->magic, "MND0", 4);
455461
metadata_footer->chunk_capacity =
456462
static_cast<uint32_t>(chunk_capacity);
463+
metadata_footer->num_cnv_chunks = flags.num_cnv_chunks;
457464
MONAD_ASSERT_PRINTF(
458465
::pwrite(
459466
readwritefd,
@@ -504,21 +511,29 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
504511
fnv1a_hash<uint32_t>::add(
505512
hashshouldbe, uint32_t(device.unique_hash_ >> 32));
506513
}
514+
// Backward compatibility: databases created before `num_cnv_chunks` was
515+
// added have this field set to 0. Treat 0 as the legacy default of 3
516+
// chunks.
517+
uint32_t const cnv_chunks_count =
518+
devices_[0].metadata_->num_cnv_chunks == 0
519+
? 3
520+
: devices_[0].metadata_->num_cnv_chunks;
507521
std::vector<size_t> chunks;
508522
size_t total = 0;
509523
chunks.reserve(devices_.size());
510524
for (auto const &device : devices_) {
511525
if (device.is_file() || device.is_block_device()) {
512526
auto const devicechunks = device.chunks();
513527
MONAD_ASSERT_PRINTF(
514-
devicechunks >= 4,
515-
"Device %s has %zu chunks the minimum allowed is four.",
528+
devicechunks >= cnv_chunks_count + 1,
529+
"Device %s has %zu chunks the minimum allowed is %u.",
516530
device.current_path().c_str(),
517-
devicechunks);
531+
devicechunks,
532+
cnv_chunks_count + 1);
518533
MONAD_ASSERT(devicechunks <= std::numeric_limits<uint32_t>::max());
519-
// Take off three for the cnv chunks
520-
chunks.push_back(devicechunks - 3);
521-
total += devicechunks - 3;
534+
// Take off cnv_chunks_count for the cnv chunks
535+
chunks.push_back(devicechunks - cnv_chunks_count);
536+
total += devicechunks - cnv_chunks_count;
522537
fnv1a_hash<uint32_t>::add(
523538
hashshouldbe, static_cast<uint32_t>(devicechunks));
524539
fnv1a_hash<uint32_t>::add(
@@ -560,22 +575,17 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
560575
auto const zone_id = [this](int const chunk_type) {
561576
return static_cast<uint32_t>(chunks_[chunk_type].size());
562577
};
563-
// First three blocks of each device goes to conventional, remainder go to
564-
// sequential
565-
chunks_[cnv].reserve(devices_.size() * 3);
578+
// First cnv_chunks_count blocks of each device goes to conventional,
579+
// remainder go to sequential
580+
chunks_[cnv].reserve(devices_.size() * cnv_chunks_count);
566581
chunks_[seq].reserve(total);
567582
if (flags.interleave_chunks_evenly) {
568-
for (auto &device : devices_) {
569-
chunks_[cnv].emplace_back(
570-
activate_chunk(storage_pool::cnv, device, 0, zone_id(cnv)));
571-
}
572-
for (auto &device : devices_) {
573-
chunks_[cnv].emplace_back(
574-
activate_chunk(storage_pool::cnv, device, 1, zone_id(cnv)));
575-
}
576-
for (auto &device : devices_) {
577-
chunks_[cnv].emplace_back(
578-
activate_chunk(storage_pool::cnv, device, 2, zone_id(cnv)));
583+
for (uint32_t chunk_idx = 0; chunk_idx < cnv_chunks_count;
584+
++chunk_idx) {
585+
for (auto &device : devices_) {
586+
chunks_[cnv].emplace_back(activate_chunk(
587+
storage_pool::cnv, device, chunk_idx, zone_id(cnv)));
588+
}
579589
}
580590
// We now need to evenly spread the sequential chunks such that if
581591
// device A has 20, device B has 10 and device C has 5, the interleaving
@@ -585,7 +595,7 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
585595
for (size_t n = 0; n < chunks.size(); n++) {
586596
chunkratios[n] = double(total) / static_cast<double>(chunks[n]);
587597
chunkcounts[n] = chunkratios[n];
588-
chunks[n] = 3;
598+
chunks[n] = cnv_chunks_count;
589599
}
590600
while (chunks_[seq].size() < chunks_[seq].capacity()) {
591601
for (size_t n = 0; n < chunks.size(); n++) {
@@ -612,19 +622,18 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
612622
}
613623
else {
614624
for (auto &device : devices_) {
615-
chunks_[cnv].emplace_back(
616-
activate_chunk(cnv, device, 0, zone_id(cnv)));
617-
chunks_[cnv].emplace_back(
618-
activate_chunk(cnv, device, 1, zone_id(cnv)));
619-
chunks_[cnv].emplace_back(
620-
activate_chunk(cnv, device, 2, zone_id(cnv)));
625+
for (uint32_t chunk_idx = 0; chunk_idx < cnv_chunks_count;
626+
++chunk_idx) {
627+
chunks_[cnv].emplace_back(
628+
activate_chunk(cnv, device, chunk_idx, zone_id(cnv)));
629+
}
621630
}
622631
for (size_t deviceidx = 0; deviceidx < chunks.size(); deviceidx++) {
623632
for (size_t n = 0; n < chunks[deviceidx]; n++) {
624633
chunks_[seq].emplace_back(activate_chunk(
625634
seq,
626635
devices_[deviceidx],
627-
static_cast<uint32_t>(3 + n),
636+
static_cast<uint32_t>(cnv_chunks_count + n),
628637
zone_id(seq)));
629638
}
630639
}

category/async/storage_pool.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ class storage_pool
9595
{
9696
// Preceding this is an array of uint32_t of chunk bytes used
9797

98-
uint32_t spare_[13]; // set aside for flags later
98+
uint32_t spare_[12]; // set aside for flags later
99+
uint32_t num_cnv_chunks; // number of cnv chunks per device
99100
uint32_t config_hash; // hash of this configuration
100101
uint32_t chunk_capacity;
101102
uint8_t magic[4]; // "MND0" for v1 of this metadata
@@ -178,6 +179,9 @@ class storage_pool
178179

179180
//! Returns the number of chunks on this device
180181
size_t chunks() const;
182+
183+
//! Returns the number of cnv chunks on this device
184+
size_t cnv_chunks() const;
181185
//! Returns the capacity of the device, and how much of that is
182186
//! currently filled with data, in that order.
183187
std::pair<file_offset_t, file_offset_t> capacity() const;
@@ -321,12 +325,16 @@ class storage_pool
321325
//! happily use any partition you feed it, including the system drive.
322326
uint32_t disable_mismatching_storage_pool_check : 1;
323327

328+
//! Number of conventional chunks to allocate per device. Default is 3.
329+
uint32_t num_cnv_chunks;
330+
324331
constexpr creation_flags()
325332
: chunk_capacity(28)
326333
, interleave_chunks_evenly(false)
327334
, open_read_only(false)
328335
, open_read_only_allow_dirty(false)
329336
, disable_mismatching_storage_pool_check(false)
337+
, num_cnv_chunks(3)
330338
{
331339
}
332340
};

category/mpt/cli_tool_impl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ struct impl_t
393393
std::ostream &cerr;
394394
MONAD_ASYNC_NAMESPACE::storage_pool::creation_flags flags;
395395
uint8_t chunk_capacity = flags.chunk_capacity;
396+
uint32_t root_offsets_chunk_count = 2;
396397
bool allow_dirty = false;
397398
bool no_prompt = false;
398399
bool create_database = false;
@@ -1445,6 +1446,12 @@ opened.
14451446
"set chunk capacity during database creation (default is 28, "
14461447
"1<<28 "
14471448
"= 256Mb, max is 31).");
1449+
cli.add_option(
1450+
"--root-offsets-chunk-count",
1451+
impl.root_offsets_chunk_count,
1452+
"Number of chunks to allocate for storing root offsets. "
1453+
"Must be a power of 2. Default is 2. "
1454+
"Each chunk holds approximately 16.5M history entries.");
14481455
cli.add_flag(
14491456
"--chunk-increasing",
14501457
impl.create_chunk_increasing,
@@ -1478,6 +1485,7 @@ opened.
14781485
impl.flags.open_read_only = true;
14791486
impl.flags.open_read_only_allow_dirty =
14801487
impl.allow_dirty || !impl.archive_database.empty();
1488+
impl.flags.num_cnv_chunks = impl.root_offsets_chunk_count + 1;
14811489
if (!impl.restore_database.empty()) {
14821490
if (!impl.archive_database.empty()) {
14831491
impl.cli_ask_question(

category/mpt/db.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,12 @@ AsyncIOContext::AsyncIOContext(ReadOnlyOnDiskDbConfig const &options)
122122

123123
AsyncIOContext::AsyncIOContext(OnDiskDbConfig const &options)
124124
: pool{[&] -> async::storage_pool {
125+
async::storage_pool::creation_flags pool_options;
126+
pool_options.num_cnv_chunks = options.root_offsets_chunk_count + 1;
125127
auto len = options.file_size_db * 1024 * 1024 * 1024 + 24576;
126128
if (options.dbname_paths.empty()) {
127129
return async::storage_pool{
128-
async::use_anonymous_sized_inode_tag{}, len};
130+
async::use_anonymous_sized_inode_tag{}, len, pool_options};
129131
}
130132
// initialize db file on disk
131133
for (auto const &dbname_path : options.dbname_paths) {
@@ -145,7 +147,8 @@ AsyncIOContext::AsyncIOContext(OnDiskDbConfig const &options)
145147
return async::storage_pool{
146148
options.dbname_paths,
147149
options.append ? async::storage_pool::mode::open_existing
148-
: async::storage_pool::mode::truncate};
150+
: async::storage_pool::mode::truncate,
151+
pool_options};
149152
}()}
150153
, read_ring{{options.uring_entries, options.sq_thread_cpu}}
151154
, write_ring{io::RingConfig{options.wr_buffers}}

category/mpt/ondisk_db_config.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ struct OnDiskDbConfig
4343
// fixed history length if contains value, otherwise rely on db to adjust
4444
// history length upon disk usage
4545
std::optional<uint64_t> fixed_history_length{std::nullopt};
46+
// Number of chunks to allocate for root offsets when initializing the disk.
47+
// Each chunk can hold 1 << 24 = 16777216 historical entries.
48+
// This field must be power of 2.
49+
uint32_t root_offsets_chunk_count{2};
4650
};
4751

4852
struct ReadOnlyOnDiskDbConfig

category/mpt/test/update_aux_test.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include <category/mpt/trie.hpp>
3131
#include <category/mpt/util.hpp>
3232

33+
#include <unistd.h>
34+
3335
using namespace std::chrono_literals;
3436

3537
namespace
@@ -197,3 +199,57 @@ TEST(update_aux_test, root_offsets_fast_slow)
197199
"Detected corruption");
198200
}
199201
}
202+
203+
// Checks that a storage pool created with a non-default number of
// conventional chunks (5) exposes that many cnv chunks, that UpdateAux uses
// the four chunks after chunk 0 for root offsets, and that the same layout is
// observed after the pool is reopened.
TEST(update_aux_test, configurable_root_offset_chunks)
{
    // mkstemp() rewrites the trailing XXXXXX in place, so it must be given a
    // mutable buffer. Work on a copy of the native string instead of casting
    // away constness of a const path's internal storage.
    auto name_template =
        (MONAD_ASYNC_NAMESPACE::working_temporary_directory() /
         "monad_update_aux_test_XXXXXX")
            .native();
    int const fd = ::mkstemp(name_template.data());
    MONAD_ASSERT(fd != -1);
    MONAD_ASSERT(-1 != ::ftruncate(fd, 8UL << 30)); // 8GB
    // The pool below opens the file by path; this descriptor is only needed
    // to create and size the file, so release it rather than leaking it.
    ::close(fd);
    std::filesystem::path const filename{name_template};

    monad::io::Ring ring1;
    monad::io::Ring ring2;
    monad::io::Buffers testbuf =
        monad::io::make_buffers_for_segregated_read_write(
            ring1,
            ring2,
            2,
            4,
            monad::async::AsyncIO::MONAD_IO_BUFFERS_READ_SIZE,
            monad::async::AsyncIO::MONAD_IO_BUFFERS_WRITE_SIZE);
    monad::async::storage_pool::creation_flags flags;
    flags.num_cnv_chunks = 5;
    {
        // Create storage pool with 5 conventional chunks
        monad::async::storage_pool pool(
            std::span{&filename, 1},
            monad::async::storage_pool::mode::truncate,
            flags);
        EXPECT_EQ(pool.chunks(monad::async::storage_pool::cnv), 5);

        monad::async::AsyncIO testio(pool, testbuf);
        monad::mpt::UpdateAux<> aux(testio);

        // Verify that exactly 4 chunks were allocated to hold two copies of
        // root offsets, since chunk 0 is used for metadata
        EXPECT_TRUE(aux.db_metadata()->using_chunks_for_root_offsets);
        EXPECT_EQ(aux.db_metadata()->root_offsets.storage_.cnv_chunks_len, 4);
        EXPECT_EQ(aux.root_offsets().capacity(), 2ULL << 25);
    }
    {
        // Reopen the existing pool and check that the same layout is reported
        monad::async::storage_pool pool(
            std::span{&filename, 1},
            monad::async::storage_pool::mode::open_existing,
            flags);
        EXPECT_EQ(pool.chunks(monad::async::storage_pool::cnv), 5);
        monad::async::AsyncIO testio(pool, testbuf);
        monad::mpt::UpdateAux<> aux(testio);
        EXPECT_TRUE(aux.db_metadata()->using_chunks_for_root_offsets);
        EXPECT_EQ(aux.db_metadata()->root_offsets.storage_.cnv_chunks_len, 4);
        EXPECT_EQ(aux.root_offsets().capacity(), 2ULL << 25);
    }
    std::filesystem::remove(filename);
}

category/mpt/update_aux.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,11 @@ void UpdateAuxImpl::set_io(
787787
start_lifetime_as<chunk_offset_t>(
788788
(chunk_offset_t *)reservation[1]),
789789
db_version_history_storage_bytes / sizeof(chunk_offset_t)};
790+
LOG_INFO(
791+
"Database root offsets ring buffer is configured with {} "
792+
"chunks, can hold up to {} historical entries.",
793+
db_metadata()->root_offsets.storage_.cnv_chunks_len,
794+
root_offsets().capacity());
790795
}
791796
};
792797
if (0 != memcmp(
@@ -830,10 +835,15 @@ void UpdateAuxImpl::set_io(
830835
db_metadata_[0].main->using_chunks_for_root_offsets = true;
831836
db_metadata_[0].main->history_length =
832837
chunk.capacity() / 2 / sizeof(chunk_offset_t);
833-
// Gobble up all remaining cnv chunks
834-
for (uint32_t n = 2;
835-
n < io->storage_pool().chunks(storage_pool::cnv);
836-
n++) {
838+
// Allocate cnv chunks of the first device - 1 for root offsets
839+
auto const root_offsets_chunk_count =
840+
io->storage_pool().devices()[0].cnv_chunks() - 1;
841+
MONAD_ASSERT(
842+
root_offsets_chunk_count > 0 &&
843+
(root_offsets_chunk_count &
844+
(root_offsets_chunk_count - 1)) == 0,
845+
"Number of cnv chunks for root offsets must be a power of two");
846+
for (uint32_t n = 2; n <= root_offsets_chunk_count; n++) {
837847
auto &chunk = io->storage_pool().chunk(storage_pool::cnv, n);
838848
auto fdw = chunk.write_fd(chunk.capacity());
839849
MONAD_ASSERT(

0 commit comments

Comments
 (0)