Skip to content

Commit 97a2d6a

Browse files
committed
[mpt] Add configurable root offsets chunk allocation
Add a --root-offsets-chunk-count flag to monad_mpt to make the number of chunks allocated for root offsets configurable (default: 16; must be a power of 2). Each chunk holds approximately 16.5M history entries, allowing operators to adjust history depth based on deployment requirements. Previously, the count was hardcoded to 2 chunks.
1 parent 5e1629b commit 97a2d6a

File tree

10 files changed

+184
-53
lines changed

10 files changed

+184
-53
lines changed

category/async/storage_pool.cpp

Lines changed: 48 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141

4242
#include <stdlib.h>
4343

44+
#include <quill/Quill.h>
45+
4446
#include <asm-generic/ioctl.h>
4547
#include <fcntl.h>
4648
#include <linux/falloc.h>
@@ -80,6 +82,12 @@ size_t storage_pool::device_t::chunks() const
8082
return metadata_->chunks(size_of_file_);
8183
}
8284

85+
size_t storage_pool::device_t::cnv_chunks() const
86+
{
87+
MONAD_ASSERT(!is_zoned_device(), "zonefs support isn't implemented yet");
88+
return metadata_->num_cnv_chunks;
89+
}
90+
8391
std::pair<file_offset_t, file_offset_t> storage_pool::device_t::capacity() const
8492
{
8593
switch (type_) {
@@ -454,6 +462,7 @@ storage_pool::device_t storage_pool::make_device_(
454462
memcpy(metadata_footer->magic, "MND0", 4);
455463
metadata_footer->chunk_capacity =
456464
static_cast<uint32_t>(chunk_capacity);
465+
metadata_footer->num_cnv_chunks = flags.num_cnv_chunks;
457466
MONAD_ASSERT_PRINTF(
458467
::pwrite(
459468
readwritefd,
@@ -465,6 +474,15 @@ storage_pool::device_t storage_pool::make_device_(
465474
}
466475
total_size =
467476
metadata_footer->total_size(static_cast<size_t>(stat.st_size));
477+
if (flags.num_cnv_chunks > metadata_footer->num_cnv_chunks) {
478+
LOG_WARNING(
479+
"Flag-specified num_cnv_chunks ({}) exceeds the value in "
480+
"metadata ({}) on the existing database. "
481+
"Existing databases cannot be reconfigured to use more chunks. "
482+
"Please create a new database to increase num_cnv_chunks.",
483+
flags.num_cnv_chunks,
484+
metadata_footer->num_cnv_chunks);
485+
}
468486
}
469487
size_t const offset = round_down_align<CPU_PAGE_BITS>(
470488
static_cast<size_t>(stat.st_size) - total_size);
@@ -504,21 +522,29 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
504522
fnv1a_hash<uint32_t>::add(
505523
hashshouldbe, uint32_t(device.unique_hash_ >> 32));
506524
}
525+
// Backward compatibility: databases created before `num_cnv_chunks` was
526+
// added have this field set to 0. Treat 0 as the legacy default of 3
527+
// chunks.
528+
uint32_t const cnv_chunks_count =
529+
devices_[0].metadata_->num_cnv_chunks == 0
530+
? 3
531+
: devices_[0].metadata_->num_cnv_chunks;
507532
std::vector<size_t> chunks;
508533
size_t total = 0;
509534
chunks.reserve(devices_.size());
510535
for (auto const &device : devices_) {
511536
if (device.is_file() || device.is_block_device()) {
512537
auto const devicechunks = device.chunks();
513538
MONAD_ASSERT_PRINTF(
514-
devicechunks >= 4,
515-
"Device %s has %zu chunks the minimum allowed is four.",
539+
devicechunks >= cnv_chunks_count + 1,
540+
"Device %s has %zu chunks the minimum allowed is %u.",
516541
device.current_path().c_str(),
517-
devicechunks);
542+
devicechunks,
543+
cnv_chunks_count + 1);
518544
MONAD_ASSERT(devicechunks <= std::numeric_limits<uint32_t>::max());
519-
// Take off three for the cnv chunks
520-
chunks.push_back(devicechunks - 3);
521-
total += devicechunks - 3;
545+
// Take off cnv_chunks_count for the cnv chunks
546+
chunks.push_back(devicechunks - cnv_chunks_count);
547+
total += devicechunks - cnv_chunks_count;
522548
fnv1a_hash<uint32_t>::add(
523549
hashshouldbe, static_cast<uint32_t>(devicechunks));
524550
fnv1a_hash<uint32_t>::add(
@@ -560,22 +586,17 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
560586
auto const zone_id = [this](int const chunk_type) {
561587
return static_cast<uint32_t>(chunks_[chunk_type].size());
562588
};
563-
// First three blocks of each device goes to conventional, remainder go to
564-
// sequential
565-
chunks_[cnv].reserve(devices_.size() * 3);
589+
// First cnv_chunks_count blocks of each device go to conventional,
590+
// remainder go to sequential
591+
chunks_[cnv].reserve(devices_.size() * cnv_chunks_count);
566592
chunks_[seq].reserve(total);
567593
if (flags.interleave_chunks_evenly) {
568-
for (auto &device : devices_) {
569-
chunks_[cnv].emplace_back(
570-
activate_chunk(storage_pool::cnv, device, 0, zone_id(cnv)));
571-
}
572-
for (auto &device : devices_) {
573-
chunks_[cnv].emplace_back(
574-
activate_chunk(storage_pool::cnv, device, 1, zone_id(cnv)));
575-
}
576-
for (auto &device : devices_) {
577-
chunks_[cnv].emplace_back(
578-
activate_chunk(storage_pool::cnv, device, 2, zone_id(cnv)));
594+
for (uint32_t chunk_idx = 0; chunk_idx < cnv_chunks_count;
595+
++chunk_idx) {
596+
for (auto &device : devices_) {
597+
chunks_[cnv].emplace_back(activate_chunk(
598+
storage_pool::cnv, device, chunk_idx, zone_id(cnv)));
599+
}
579600
}
580601
// We now need to evenly spread the sequential chunks such that if
581602
// device A has 20, device B has 10 and device C has 5, the interleaving
@@ -585,7 +606,7 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
585606
for (size_t n = 0; n < chunks.size(); n++) {
586607
chunkratios[n] = double(total) / static_cast<double>(chunks[n]);
587608
chunkcounts[n] = chunkratios[n];
588-
chunks[n] = 3;
609+
chunks[n] = cnv_chunks_count;
589610
}
590611
while (chunks_[seq].size() < chunks_[seq].capacity()) {
591612
for (size_t n = 0; n < chunks.size(); n++) {
@@ -612,19 +633,18 @@ void storage_pool::fill_chunks_(creation_flags const &flags)
612633
}
613634
else {
614635
for (auto &device : devices_) {
615-
chunks_[cnv].emplace_back(
616-
activate_chunk(cnv, device, 0, zone_id(cnv)));
617-
chunks_[cnv].emplace_back(
618-
activate_chunk(cnv, device, 1, zone_id(cnv)));
619-
chunks_[cnv].emplace_back(
620-
activate_chunk(cnv, device, 2, zone_id(cnv)));
636+
for (uint32_t chunk_idx = 0; chunk_idx < cnv_chunks_count;
637+
++chunk_idx) {
638+
chunks_[cnv].emplace_back(
639+
activate_chunk(cnv, device, chunk_idx, zone_id(cnv)));
640+
}
621641
}
622642
for (size_t deviceidx = 0; deviceidx < chunks.size(); deviceidx++) {
623643
for (size_t n = 0; n < chunks[deviceidx]; n++) {
624644
chunks_[seq].emplace_back(activate_chunk(
625645
seq,
626646
devices_[deviceidx],
627-
static_cast<uint32_t>(3 + n),
647+
static_cast<uint32_t>(cnv_chunks_count + n),
628648
zone_id(seq)));
629649
}
630650
}

category/async/storage_pool.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ class storage_pool
9595
{
9696
// Preceding this is an array of uint32_t of chunk bytes used
9797

98-
uint32_t spare_[13]; // set aside for flags later
98+
uint32_t spare_[12]; // set aside for flags later
99+
uint32_t num_cnv_chunks; // number of cnv chunks per device
99100
uint32_t config_hash; // hash of this configuration
100101
uint32_t chunk_capacity;
101102
uint8_t magic[4]; // "MND0" for v1 of this metadata
@@ -178,6 +179,9 @@ class storage_pool
178179

179180
//! Returns the number of chunks on this device
180181
size_t chunks() const;
182+
183+
//! Returns the number of cnv chunks on this device
184+
size_t cnv_chunks() const;
181185
//! Returns the capacity of the device, and how much of that is
182186
//! currently filled with data, in that order.
183187
std::pair<file_offset_t, file_offset_t> capacity() const;
@@ -321,12 +325,16 @@ class storage_pool
321325
//! happily use any partition you feed it, including the system drive.
322326
uint32_t disable_mismatching_storage_pool_check : 1;
323327

328+
//! Number of conventional chunks to allocate per device. Default is 3.
329+
uint32_t num_cnv_chunks;
330+
324331
constexpr creation_flags()
325332
: chunk_capacity(28)
326333
, interleave_chunks_evenly(false)
327334
, open_read_only(false)
328335
, open_read_only_allow_dirty(false)
329336
, disable_mismatching_storage_pool_check(false)
337+
, num_cnv_chunks(3)
330338
{
331339
}
332340
};

category/mpt/cli_tool_impl.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ struct impl_t
393393
std::ostream &cerr;
394394
MONAD_ASYNC_NAMESPACE::storage_pool::creation_flags flags;
395395
uint8_t chunk_capacity = flags.chunk_capacity;
396+
uint32_t root_offsets_chunk_count = 16;
396397
bool allow_dirty = false;
397398
bool no_prompt = false;
398399
bool create_database = false;
@@ -1445,6 +1446,22 @@ opened.
14451446
"set chunk capacity during database creation (default is 28, "
14461447
"1<<28 "
14471448
"= 256Mb, max is 31).");
1449+
cli.add_option(
1450+
"--root-offsets-chunk-count",
1451+
impl.root_offsets_chunk_count,
1452+
"Number of chunks to allocate for storing root offsets. "
1453+
"Must be a positive number that is a power of 2. Default is "
1454+
"16. Each chunk holds approx 16.5M history entries.")
1455+
->check([](std::string const &s) {
1456+
auto const v = std::stoll(s);
1457+
if (v <= 0) {
1458+
return "Value must be positive";
1459+
}
1460+
if ((v & (v - 1)) != 0) {
1461+
return "Value must be a power of 2";
1462+
}
1463+
return "";
1464+
});
14481465
cli.add_flag(
14491466
"--chunk-increasing",
14501467
impl.create_chunk_increasing,
@@ -1478,6 +1495,9 @@ opened.
14781495
impl.flags.open_read_only = true;
14791496
impl.flags.open_read_only_allow_dirty =
14801497
impl.allow_dirty || !impl.archive_database.empty();
1498+
impl.flags.num_cnv_chunks =
1499+
impl.root_offsets_chunk_count +
1500+
monad::mpt::UpdateAuxImpl::cnv_chunks_for_db_metadata;
14811501
if (!impl.restore_database.empty()) {
14821502
if (!impl.archive_database.empty()) {
14831503
impl.cli_ask_question(

category/mpt/db.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,12 @@ AsyncIOContext::AsyncIOContext(ReadOnlyOnDiskDbConfig const &options)
122122

123123
AsyncIOContext::AsyncIOContext(OnDiskDbConfig const &options)
124124
: pool{[&] -> async::storage_pool {
125+
async::storage_pool::creation_flags pool_options;
126+
pool_options.num_cnv_chunks = options.root_offsets_chunk_count + 1;
125127
auto len = options.file_size_db * 1024 * 1024 * 1024 + 24576;
126128
if (options.dbname_paths.empty()) {
127129
return async::storage_pool{
128-
async::use_anonymous_sized_inode_tag{}, len};
130+
async::use_anonymous_sized_inode_tag{}, len, pool_options};
129131
}
130132
// initialize db file on disk
131133
for (auto const &dbname_path : options.dbname_paths) {
@@ -145,7 +147,8 @@ AsyncIOContext::AsyncIOContext(OnDiskDbConfig const &options)
145147
return async::storage_pool{
146148
options.dbname_paths,
147149
options.append ? async::storage_pool::mode::open_existing
148-
: async::storage_pool::mode::truncate};
150+
: async::storage_pool::mode::truncate,
151+
pool_options};
149152
}()}
150153
, read_ring{{options.uring_entries, options.sq_thread_cpu}}
151154
, write_ring{io::RingConfig{options.wr_buffers}}

category/mpt/ondisk_db_config.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ struct OnDiskDbConfig
4343
// fixed history length if contains value, otherwise rely on db to adjust
4444
// history length upon disk usage
4545
std::optional<uint64_t> fixed_history_length{std::nullopt};
46+
// Number of chunks to allocate for root offsets when initializing the disk.
47+
// Each chunk can hold 1 << 24 = 16777216 historical entries.
48+
// This field must be a power of 2.
49+
uint32_t root_offsets_chunk_count{16};
4650
};
4751

4852
struct ReadOnlyOnDiskDbConfig

category/mpt/test/cli_tool_test.cpp

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ TEST(cli_tool, create)
7373
::close(fd);
7474
auto untempfile =
7575
monad::make_scope_exit([&]() noexcept { unlink(temppath); });
76-
if (-1 == truncate(temppath, 2ULL * 1024 * 1024 * 1024)) {
76+
if (-1 == truncate(temppath, 6ULL * 1024 * 1024 * 1024)) {
7777
abort();
7878
}
7979
std::cout << "temp file being used: " << temppath << std::endl;
@@ -108,6 +108,8 @@ struct cli_tool_fixture
108108
{
109109
void run_test()
110110
{
111+
constexpr unsigned default_num_cnv_chunks = 17;
112+
111113
char temppath1[] = "cli_tool_test_XXXXXX";
112114
char dbpath2a[] = "cli_tool_test_XXXXXX";
113115
char dbpath2b[] = "cli_tool_test_XXXXXX";
@@ -152,15 +154,15 @@ struct cli_tool_fixture
152154
if (Config.interleave_multiple_sources) {
153155
if (-1 == truncate(
154156
dbpath2a,
155-
(4 + Config.chunks_max / 2) *
157+
(default_num_cnv_chunks + Config.chunks_max / 2) *
156158
MONAD_ASYNC_NAMESPACE::AsyncIO::
157159
MONAD_IO_BUFFERS_WRITE_SIZE +
158160
24576)) {
159161
abort();
160162
}
161163
if (-1 == truncate(
162164
dbpath2b,
163-
(4 + Config.chunks_max / 2) *
165+
(default_num_cnv_chunks + Config.chunks_max / 2) *
164166
MONAD_ASYNC_NAMESPACE::AsyncIO::
165167
MONAD_IO_BUFFERS_WRITE_SIZE +
166168
24576)) {
@@ -170,12 +172,12 @@ struct cli_tool_fixture
170172
dbpath2.push_back(dbpath2b);
171173
}
172174
else {
173-
if (-1 ==
174-
truncate(
175-
dbpath2a,
176-
(3 + Config.chunks_max) * MONAD_ASYNC_NAMESPACE::AsyncIO::
177-
MONAD_IO_BUFFERS_WRITE_SIZE +
178-
24576)) {
175+
if (-1 == truncate(
176+
dbpath2a,
177+
(default_num_cnv_chunks + Config.chunks_max) *
178+
MONAD_ASYNC_NAMESPACE::AsyncIO::
179+
MONAD_IO_BUFFERS_WRITE_SIZE +
180+
24576)) {
179181
abort();
180182
}
181183
dbpath2.push_back(dbpath2a);
@@ -257,12 +259,12 @@ struct cli_tool_fixture
257259
if (-1 == fd) {
258260
abort();
259261
}
260-
if (-1 ==
261-
ftruncate(
262-
fd,
263-
(3 + Config.chunks_max) * MONAD_ASYNC_NAMESPACE::AsyncIO::
264-
MONAD_IO_BUFFERS_WRITE_SIZE +
265-
24576)) {
262+
if (-1 == ftruncate(
263+
fd,
264+
(default_num_cnv_chunks + Config.chunks_max) *
265+
MONAD_ASYNC_NAMESPACE::AsyncIO::
266+
MONAD_IO_BUFFERS_WRITE_SIZE +
267+
24576)) {
266268
abort();
267269
}
268270
::close(fd);

category/mpt/test/db_test.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -816,7 +816,8 @@ TEST(DbTest, history_length_adjustment_never_under_min)
816816
OnDiskDbConfig config{
817817
.compaction = true,
818818
.sq_thread_cpu{std::nullopt},
819-
.dbname_paths = {dbname}};
819+
.dbname_paths = {dbname},
820+
.root_offsets_chunk_count = 2};
820821
Db db{machine, config};
821822
Node::SharedPtr root{};
822823

@@ -1162,7 +1163,8 @@ TEST(DbTest, out_of_order_upserts_to_nonexist_earlier_version)
11621163
.compaction = true,
11631164
.sq_thread_cpu{std::nullopt},
11641165
.dbname_paths = {dbname},
1165-
.fixed_history_length = MPT_TEST_HISTORY_LENGTH};
1166+
.fixed_history_length = MPT_TEST_HISTORY_LENGTH,
1167+
.root_offsets_chunk_count = 2};
11661168
Db db{machine, config};
11671169

11681170
AsyncIOContext io_ctx{ReadOnlyOnDiskDbConfig{.dbname_paths = {dbname}}};
@@ -1224,7 +1226,8 @@ TEST(DbTest, out_of_order_upserts_with_compaction)
12241226
.compaction = true,
12251227
.sq_thread_cpu{std::nullopt},
12261228
.dbname_paths = {dbname},
1227-
.fixed_history_length = MPT_TEST_HISTORY_LENGTH};
1229+
.fixed_history_length = MPT_TEST_HISTORY_LENGTH,
1230+
.root_offsets_chunk_count = 2};
12281231
Db db{machine, config};
12291232
AsyncIOContext io_ctx{ReadOnlyOnDiskDbConfig{.dbname_paths = {dbname}}};
12301233
Db rodb{io_ctx};

0 commit comments

Comments
 (0)