From 6ddb28cfe77cac0f590facb7b3942f41afc439bd Mon Sep 17 00:00:00 2001 From: Matt Klein Date: Fri, 4 Nov 2016 13:53:16 -0700 Subject: [PATCH] add --drain-time-s and --parent-shutdown-time-s CLI options (#195) Allows configurable drain and shutdown times during hot restart. --- docs/intro/arch_overview/hot_restart.rst | 6 ++++-- docs/operations/cli.rst | 16 ++++++++++++++++ include/envoy/server/options.h | 11 +++++++++++ source/server/drain_manager_impl.cc | 13 +++++-------- source/server/drain_manager_impl.h | 3 --- source/server/options_impl.cc | 7 +++++++ source/server/options_impl.h | 4 ++++ test/integration/server.h | 2 ++ test/mocks/server/mocks.h | 2 ++ test/server/drain_manager_impl_test.cc | 4 +++- test/server/options_impl_test.cc | 16 ++++++++++++++++ 11 files changed, 70 insertions(+), 14 deletions(-) diff --git a/docs/intro/arch_overview/hot_restart.rst b/docs/intro/arch_overview/hot_restart.rst index 55d22e02896f..980eb372547f 100644 --- a/docs/intro/arch_overview/hot_restart.rst +++ b/docs/intro/arch_overview/hot_restart.rst @@ -17,8 +17,10 @@ hot restart functionality has has the following general architecture: the old process. The new process starts listening and then tells the old process to start draining. * During the draining phase, the old process attempts to gracefully close existing connections. How - this is done depends on the configured filters. The drain time is configurable and as more time - passes draining becomes more aggressive. + this is done depends on the configured filters. The drain time is configurable via the + :option:`--drain-time-s` option and as more time passes draining becomes more aggressive. +* After drain sequence, the new Envoy process tells the old Envoy process to shut itself down. + This time is configurable via the :option:`--parent-shutdown-time-s` option. * Envoy’s hot restart support was designed so that it will work correctly even if the new Envoy process and the old Envoy process are running inside different containers. Communication between the processes takes place only using unix domain sockets. diff --git a/docs/operations/cli.rst b/docs/operations/cli.rst index f6d4a55d0a35..6c94533189bd 100644 --- a/docs/operations/cli.rst +++ b/docs/operations/cli.rst @@ -70,3 +70,19 @@ following are the command line options that Envoy supports. the interval has elapsed, whichever comes first. Adjusting this setting is useful when tailing :ref:`access logs ` in order to get more (or less) immediate flushing. + +.. option:: --drain-time-s + + *(optional)* The time in seconds that Envoy will drain connections during a hot restart. See the + :ref:`hot restart overview ` for more information. Defaults to 600 + seconds (10 minutes). Generally the drain time should be less than the parent shutdown time + set via the :option:`--parent-shutdown-time-s` option. How the two settings are configured + depends on the specific deployment. In edge scenarios, it might be desirable to have a very long + drain time. In service to service scenarios, it might be possible to make the drain and shutdown + time much shorter (e.g., 60s/90s). + +.. option:: --parent-shutdown-time-s + + *(optional)* The time in seconds that Envoy will wait before shutting down the parent process + during a hot restart. See the :ref:`hot restart overview ` for more + information. Defaults to 900 seconds (15 minutes). diff --git a/include/envoy/server/options.h b/include/envoy/server/options.h index 8e0e378c5587..1f57c5325fb9 100644 --- a/include/envoy/server/options.h +++ b/include/envoy/server/options.h @@ -24,6 +24,11 @@ class Options { */ virtual uint32_t concurrency() PURE; + /** + * @return the number of seconds that envoy will perform draining during a hot restart. + */ + virtual std::chrono::seconds drainTime() PURE; + /** * @return const std::string& the path to the configuration file. */ @@ -34,6 +39,12 @@ class Options { */ virtual spdlog::level::level_enum logLevel() PURE; + /** + * @return the number of seconds that envoy will wait before shutting down the parent envoy during + * a host restart. Generally this will be longer than the drainTime() option. + */ + virtual std::chrono::seconds parentShutdownTime() PURE; + /** * @return the restart epoch. 0 indicates the first server start, 1 the second, and so on. */ diff --git a/source/server/drain_manager_impl.cc b/source/server/drain_manager_impl.cc index 7e2b42fa9550..7634105f5d35 100644 --- a/source/server/drain_manager_impl.cc +++ b/source/server/drain_manager_impl.cc @@ -9,9 +9,6 @@ namespace Server { -const std::chrono::minutes DrainManagerImpl::DEFAULT_DRAIN_TIME{10}; -const std::chrono::minutes DrainManagerImpl::DEFAULT_PARENT_SHUTDOWN_TIME{15}; - DrainManagerImpl::DrainManagerImpl(Instance& server) : server_(server) {} bool DrainManagerImpl::drainClose() { @@ -26,15 +23,15 @@ bool DrainManagerImpl::drainClose() { // We use the tick time as in increasing chance that we shutdown connections. return static_cast(drain_time_completed_.count()) > - (server_.random().random() % DEFAULT_DRAIN_TIME.count()); + (server_.random().random() % server_.options().drainTime().count()); } void DrainManagerImpl::drainSequenceTick() { log_trace("drain tick #{}", drain_time_completed_.count()); - ASSERT(drain_time_completed_ < DEFAULT_DRAIN_TIME); + ASSERT(drain_time_completed_ < server_.options().drainTime()); drain_time_completed_ += std::chrono::seconds(1); - if (drain_time_completed_ < DEFAULT_DRAIN_TIME) { + if (drain_time_completed_ < server_.options().drainTime()) { drain_tick_timer_->enableTimer(std::chrono::milliseconds(1000)); } } @@ -53,8 +50,8 @@ void DrainManagerImpl::startParentShutdownSequence() { server_.hotRestart().terminateParent(); }); - parent_shutdown_timer_->enableTimer( - std::chrono::duration_cast(DEFAULT_PARENT_SHUTDOWN_TIME)); + parent_shutdown_timer_->enableTimer(std::chrono::duration_cast( + server_.options().parentShutdownTime())); } } // Server diff --git a/source/server/drain_manager_impl.h b/source/server/drain_manager_impl.h index 4129b11c408a..238f96804a30 100644 --- a/source/server/drain_manager_impl.h +++ b/source/server/drain_manager_impl.h @@ -26,9 +26,6 @@ class DrainManagerImpl : Logger::Loggable, public DrainManager private: void drainSequenceTick(); - static const std::chrono::minutes DEFAULT_DRAIN_TIME; - static const std::chrono::minutes DEFAULT_PARENT_SHUTDOWN_TIME; - Instance& server_; Event::TimerPtr drain_tick_timer_; std::chrono::seconds drain_time_completed_{}; diff --git a/source/server/options_impl.cc b/source/server/options_impl.cc index b9712c5e51da..344330b4b39a 100644 --- a/source/server/options_impl.cc +++ b/source/server/options_impl.cc @@ -39,6 +39,11 @@ OptionsImpl::OptionsImpl(int argc, char** argv, const std::string& hot_restart_v TCLAP::ValueArg file_flush_interval_msec("", "file-flush-interval-msec", "Interval for log flushing in msec", false, 10000, "uint64_t", cmd); + TCLAP::ValueArg drain_time_s("", "drain-time-s", "Hot restart drain time in seconds", + false, 600, "uint64_t", cmd); + TCLAP::ValueArg parent_shutdown_time_s("", "parent-shutdown-time-s", + "Hot restart parent shutdown time in seconds", + false, 900, "uint64_t", cmd); try { cmd.parse(argc, argv); @@ -68,4 +73,6 @@ OptionsImpl::OptionsImpl(int argc, char** argv, const std::string& hot_restart_v service_node_ = service_node.getValue(); service_zone_ = service_zone.getValue(); file_flush_interval_msec_ = std::chrono::milliseconds(file_flush_interval_msec.getValue()); + drain_time_ = std::chrono::seconds(drain_time_s.getValue()); + parent_shutdown_time_ = std::chrono::seconds(parent_shutdown_time_s.getValue()); } diff --git a/source/server/options_impl.h b/source/server/options_impl.h index c509554ade1c..69c44601b1dc 100644 --- a/source/server/options_impl.h +++ b/source/server/options_impl.h @@ -14,7 +14,9 @@ class OptionsImpl : public Server::Options { uint64_t baseId() { return base_id_; } uint32_t concurrency() override { return concurrency_; } const std::string& configPath() override { return config_path_; } + std::chrono::seconds drainTime() override { return drain_time_; } spdlog::level::level_enum logLevel() override { return log_level_; } + std::chrono::seconds parentShutdownTime() override { return parent_shutdown_time_; } uint64_t restartEpoch() override { return restart_epoch_; } const std::string& serviceClusterName() override { return service_cluster_; } const std::string& serviceNodeName() override { return service_node_; } @@ -31,4 +33,6 @@ class OptionsImpl : public Server::Options { std::string service_node_; std::string service_zone_; std::chrono::milliseconds file_flush_interval_msec_; + std::chrono::seconds drain_time_; + std::chrono::seconds parent_shutdown_time_; }; diff --git a/test/integration/server.h b/test/integration/server.h index 7244f479700c..c0bfc2fe340f 100644 --- a/test/integration/server.h +++ b/test/integration/server.h @@ -22,7 +22,9 @@ class TestOptionsImpl : public Options { uint64_t baseId() override { return 0; } uint32_t concurrency() override { return 1; } const std::string& configPath() override { return config_path_; } + std::chrono::seconds drainTime() override { return std::chrono::seconds(0); } spdlog::level::level_enum logLevel() override { NOT_IMPLEMENTED; } + std::chrono::seconds parentShutdownTime() override { return std::chrono::seconds(0); } uint64_t restartEpoch() override { return 0; } const std::string& serviceClusterName() override { return cluster_name_; } const std::string& serviceNodeName() override { return node_name_; } diff --git a/test/mocks/server/mocks.h b/test/mocks/server/mocks.h index 2cb0763f9b2d..641bb512f064 100644 --- a/test/mocks/server/mocks.h +++ b/test/mocks/server/mocks.h @@ -32,7 +32,9 @@ class MockOptions : public Options { MOCK_METHOD0(baseId, uint64_t()); MOCK_METHOD0(concurrency, uint32_t()); MOCK_METHOD0(configPath, const std::string&()); + MOCK_METHOD0(drainTime, std::chrono::seconds()); MOCK_METHOD0(logLevel, spdlog::level::level_enum()); + MOCK_METHOD0(parentShutdownTime, std::chrono::seconds()); MOCK_METHOD0(restartEpoch, uint64_t()); MOCK_METHOD0(serviceClusterName, const std::string&()); MOCK_METHOD0(serviceNodeName, const std::string&()); diff --git a/test/server/drain_manager_impl_test.cc b/test/server/drain_manager_impl_test.cc index e386aa190bb6..6e3ec787463c 100644 --- a/test/server/drain_manager_impl_test.cc +++ b/test/server/drain_manager_impl_test.cc @@ -10,11 +10,13 @@ namespace Server { TEST(DrainManagerImplTest, All) { NiceMock server; + ON_CALL(server.options_, drainTime()).WillByDefault(Return(std::chrono::seconds(600))); + ON_CALL(server.options_, parentShutdownTime()).WillByDefault(Return(std::chrono::seconds(900))); DrainManagerImpl drain_manager(server); // Test parent shutdown. Event::MockTimer* shutdown_timer = new Event::MockTimer(&server.dispatcher_); - EXPECT_CALL(*shutdown_timer, enableTimer(_)); + EXPECT_CALL(*shutdown_timer, enableTimer(std::chrono::milliseconds(900000))); drain_manager.startParentShutdownSequence(); EXPECT_CALL(server.hot_restart_, terminateParent()); diff --git a/test/server/options_impl_test.cc b/test/server/options_impl_test.cc index 9935fb8808c1..4d68a6403d9a 100644 --- a/test/server/options_impl_test.cc +++ b/test/server/options_impl_test.cc @@ -19,6 +19,10 @@ TEST(OptionsImplTest, All) { argv.push_back("zone"); argv.push_back("--file-flush-interval-msec"); argv.push_back("9000"); + argv.push_back("--drain-time-s"); + argv.push_back("60"); + argv.push_back("--parent-shutdown-time-s"); + argv.push_back("90"); OptionsImpl options(argv.size(), const_cast(&argv[0]), "1", spdlog::level::warn); EXPECT_EQ(2U, options.concurrency()); EXPECT_EQ("hello", options.configPath()); @@ -28,4 +32,16 @@ TEST(OptionsImplTest, All) { EXPECT_EQ("node", options.serviceNodeName()); EXPECT_EQ("zone", options.serviceZone()); EXPECT_EQ(std::chrono::milliseconds(9000), options.fileFlushIntervalMsec()); + EXPECT_EQ(std::chrono::seconds(60), options.drainTime()); + EXPECT_EQ(std::chrono::seconds(90), options.parentShutdownTime()); +} + +TEST(OptionsImplTest, DefaultParams) { + std::vector argv; + argv.push_back("envoy"); + argv.push_back("-c"); + argv.push_back("hello"); + OptionsImpl options(argv.size(), const_cast(&argv[0]), "1", spdlog::level::warn); + EXPECT_EQ(std::chrono::seconds(600), options.drainTime()); + EXPECT_EQ(std::chrono::seconds(900), options.parentShutdownTime()); }