-
Notifications
You must be signed in to change notification settings - Fork 1k
Check # CPUs and memory size on startup #5128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,6 +57,7 @@ | |
| #include <iostream> | ||
| #include <lib/clara.hpp> | ||
| #include <optional> | ||
| #include <thread> | ||
|
|
||
| namespace stellar | ||
| { | ||
|
|
@@ -1607,6 +1608,121 @@ runReportLastHistoryCheckpoint(CommandLineArgs const& args) | |
| }); | ||
| } | ||
|
|
||
| namespace | ||
| { | ||
| // Before starting the application, we want to check that the host machine meets | ||
| // the minimum system requirements we document (16 GiB RAM, 8 vCPUs). To make | ||
| // the check meaningful, we prevent the node from starting when it fails | ||
| // (instead of, e.g., writing a log message that may go unread). In the case | ||
| // that we aren't able to automatically determine the relevant system | ||
| // information, we allow the operator to set config values with their system | ||
| // information. Note: we intentionally have these as integer values instead of | ||
| // booleans so that there isn't a silent failure if we bump the minimum | ||
| // requirements. If the auto-detected (or operator-provided, in the case of | ||
| // auto-detection failure) system information doesn't meet the minimum | ||
| // requirements, we require the operator to set an additional config value to | ||
| // explicitly acknowledge that they are ignoring the warning. | ||
|
|
||
| // We want to make the flag value annoying enough to set that the operators have | ||
| // to make an intentional and continuous decision to ignore the warning. We use | ||
| // the version to make sure that every time they upgrade the package, they have | ||
| // to make a new decision to ignore the warning, and we use the public key to | ||
| // make sure that the value is unique per node. Notably, we don't use something | ||
| // that depends on the current time so that restarts after crashes are handled | ||
| // gracefully (assuming the package wasn't upgraded in between). | ||
| bool | ||
| validateSystemInfo(Config const& cfg) | ||
| { | ||
| std::string annoyingValue = | ||
| fmt::format(FMT_STRING("{}-{}"), STELLAR_CORE_VERSION, | ||
| KeyUtils::toStrKey(cfg.NODE_SEED.getPublicKey())); | ||
|
|
||
| uint64_t memory = rust_bridge::get_host_total_memory(); | ||
| if (memory == 0) | ||
| { | ||
| if (!cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT) | ||
| { | ||
| LOG_ERROR(DEFAULT_LOG, | ||
| "Unable to determine total memory of the host; please " | ||
| "ensure that the system has at least 16 GiB of RAM. Once " | ||
| "confirmed, set SYSCHECK_UNKNOWN_MEMORY_DEFAULT to the " | ||
| "size of RAM in KiB."); | ||
| return false; | ||
| } | ||
|
|
||
| LOG_WARNING(DEFAULT_LOG, | ||
| "Unable to determine total memory of the host; using " | ||
| "SYSCHECK_UNKNOWN_MEMORY_DEFAULT value of {} KiB for " | ||
| "checks. Please ensure this is still the correct value.", | ||
| cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT); | ||
|
|
||
| memory = cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT; | ||
| } | ||
|
|
||
| if (memory < static_cast<uint32_t>(16) * 1024 * 1024) | ||
| { | ||
| if (cfg.SYSCHECK_FORCE_IGNORE_MEMORY != annoyingValue) | ||
| { | ||
| LOG_ERROR( | ||
| DEFAULT_LOG, | ||
| "Host only has {} KiB of RAM; stellar-core may not function " | ||
| "properly under heavy load; please ensure that the system has " | ||
| "at least 16 GiB of RAM. To force ignore this warning, set " | ||
| "SYSCHECK_FORCE_IGNORE_MEMORY to \"{}\". Note that this value " | ||
| "differs for every node and version.", | ||
| memory, annoyingValue); | ||
| return false; | ||
| } | ||
| LOG_WARNING( | ||
| DEFAULT_LOG, | ||
| "Host only has {} KiB of RAM; the recommended minimum is 16 GiB", | ||
| memory); | ||
| } | ||
|
|
||
| unsigned int cpus = std::thread::hardware_concurrency(); | ||
| if (cpus == 0) | ||
| { | ||
| if (!cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT) | ||
| { | ||
| LOG_ERROR( | ||
| DEFAULT_LOG, | ||
| "Unable to determine number of vCPUs of the host; please " | ||
| "ensure that the system has at least 8 vCPUs. Once confirmed, " | ||
| "set SYSCHECK_UNKNOWN_CPU_DEFAULT to the number of vCPUs."); | ||
| return false; | ||
| } | ||
|
|
||
| LOG_WARNING(DEFAULT_LOG, | ||
| "Unable to determine number of vCPUs of the host; using " | ||
| "SYSCHECK_UNKNOWN_CPU_DEFAULT value of {} for checks. " | ||
| "Please ensure this is still the correct value.", | ||
| cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT); | ||
|
|
||
| cpus = cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT; | ||
| } | ||
|
|
||
| if (cpus < 8) | ||
| { | ||
| if (cfg.SYSCHECK_FORCE_IGNORE_CPU != annoyingValue) | ||
| { | ||
| LOG_ERROR(DEFAULT_LOG, | ||
|
Comment on lines
+1704
to
+1708
|
||
| "Host only has {} vCPUs; stellar-core may not function " | ||
| "properly under heavy load; please ensure that the " | ||
| "system has at least 8 vCPUs. To force ignore this " | ||
| "warning, set SYSCHECK_FORCE_IGNORE_CPU to \"{}\". Note " | ||
| "that value differs for every node and version.", | ||
| cpus, annoyingValue); | ||
| return false; | ||
| } | ||
| LOG_WARNING(DEFAULT_LOG, | ||
| "Host only has {} vCPUs; the recommended minimum is 8", | ||
| cpus); | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
| } // namespace | ||
|
|
||
| int | ||
| run(CommandLineArgs const& args) | ||
| { | ||
|
|
@@ -1682,6 +1798,16 @@ run(CommandLineArgs const& args) | |
| "enabled (for testing only)"); | ||
| } | ||
|
|
||
| if (gIsProductionNetwork && cfg.NODE_IS_VALIDATOR && | ||
| !validateSystemInfo(cfg)) | ||
| { | ||
| LOG_ERROR( | ||
| DEFAULT_LOG, | ||
| "Host system does not meet the minimum requirements " | ||
| "for running stellar-core. Exiting."); | ||
| return 1; | ||
| } | ||
|
|
||
| // Second, setup the app with the final configuration. | ||
| clock = std::make_shared<VirtualClock>(clockMode); | ||
| app = setupApp(cfg, *clock); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -672,6 +672,14 @@ class Config : public std::enable_shared_from_this<Config> | |
| bool LOG_COLOR; | ||
| std::string BUCKET_DIR_PATH; | ||
|
|
||
| // Config parameters controlling startup system info validation (only | ||
| // relevant for mainnet validators). See the comment above | ||
| // validateSystemInfo in CommandLine.cpp for details. | ||
| uint32_t SYSCHECK_UNKNOWN_MEMORY_DEFAULT{0}; | ||
| uint32_t SYSCHECK_UNKNOWN_CPU_DEFAULT{0}; | ||
| std::string SYSCHECK_FORCE_IGNORE_MEMORY; | ||
| std::string SYSCHECK_FORCE_IGNORE_CPU; | ||
|
Comment on lines
+675
to
+681
|
||
|
|
||
| // Path to Protocol 23 corruption CSV file for testing/recovery | ||
| std::string PATH_TO_PROTOCOL_23_CORRUPTION_FILE; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The minimum RAM requirement (16 GiB) is duplicated across the numeric check and multiple log strings. To reduce the chance of inconsistencies if requirements change, consider introducing a named constant (e.g.,
MIN_VALIDATOR_RAM_KIB) and using it for both the comparison and messaging.