forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[xray] Monitor for Raylet processes (ray-project#1831)
* Add raylet monitor script to timeout Raylet heartbeats
* Unit test for removing a different client from the client table
* Set node manager heartbeat according to global config
* Doc and fixes
* Add regression test for client table disconnect, refactor client table
* Fix linting.
- Loading branch information
1 parent
0d9a7a3
commit cbf3181
Showing
11 changed files
with
274 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#include "ray/raylet/monitor.h" | ||
|
||
#include "ray/status.h" | ||
|
||
namespace ray { | ||
|
||
namespace raylet { | ||
|
||
/// \class Monitor
///
/// Watches Raylet heartbeats and decides when a Raylet should be considered
/// dead. A Raylet that stays silent for
/// heartbeat_timeout_milliseconds * num_heartbeats_timeout (both taken from
/// the Ray configuration) is marked as dead in the client table, which
/// broadcasts the event to all other Raylets.
///
/// Constructing a monitor connects its GCS client to the given Redis endpoint
/// and attaches that client to the supplied event loop. Either step failing
/// trips RAY_CHECK_OK.
///
/// \param io_service The event loop used by the GCS client and the tick timer.
/// \param redis_address The GCS Redis address to connect to.
/// \param redis_port The GCS Redis port to connect to.
Monitor::Monitor(boost::asio::io_service &io_service, const std::string &redis_address,
                 int redis_port)
    : gcs_client_(),
      // Despite the "_ms" suffix, this member is seeded with the configured
      // number of heartbeat periods (num_heartbeats_timeout), not a duration.
      heartbeat_timeout_ms_(RayConfig::instance().num_heartbeats_timeout()),
      heartbeat_timer_(io_service) {
  // Connect first, then hook the client into the event loop.
  RAY_CHECK_OK(gcs_client_.Connect(redis_address, redis_port));
  RAY_CHECK_OK(gcs_client_.Attach(io_service));
}
|
||
/// Record a heartbeat from a Raylet by resetting its remaining-tick budget.
///
/// \param client_id The client ID of the Raylet that sent the heartbeat.
void Monitor::HandleHeartbeat(const ClientID &client_id) {
  // operator[] creates the entry the first time a Raylet is heard from;
  // afterwards it simply refreshes the existing countdown.
  auto &ticks_remaining = heartbeats_[client_id];
  ticks_remaining = heartbeat_timeout_ms_;
}
|
||
void Monitor::Start() { | ||
const auto heartbeat_callback = [this](gcs::AsyncGcsClient *client, const ClientID &id, | ||
const HeartbeatTableDataT &heartbeat_data) { | ||
HandleHeartbeat(id); | ||
}; | ||
RAY_CHECK_OK(gcs_client_.heartbeat_table().Subscribe(UniqueID::nil(), UniqueID::nil(), | ||
heartbeat_callback, nullptr)); | ||
Tick(); | ||
} | ||
|
||
/// A periodic timer that checks for timed out clients. | ||
void Monitor::Tick() { | ||
for (auto it = heartbeats_.begin(); it != heartbeats_.end();) { | ||
it->second--; | ||
if (it->second == 0) { | ||
if (dead_clients_.count(it->first) == 0) { | ||
RAY_LOG(WARNING) << "Client timed out: " << it->first; | ||
RAY_CHECK_OK(gcs_client_.client_table().MarkDisconnected(it->first)); | ||
dead_clients_.insert(it->first); | ||
} | ||
it = heartbeats_.erase(it); | ||
} else { | ||
it++; | ||
} | ||
} | ||
|
||
auto heartbeat_period = boost::posix_time::milliseconds( | ||
RayConfig::instance().heartbeat_timeout_milliseconds()); | ||
heartbeat_timer_.expires_from_now(heartbeat_period); | ||
heartbeat_timer_.async_wait([this](const boost::system::error_code &error) { | ||
RAY_CHECK(!error); | ||
Tick(); | ||
}); | ||
} | ||
|
||
} // namespace raylet | ||
|
||
} // namespace ray |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#ifndef RAY_RAYLET_MONITOR_H | ||
#define RAY_RAYLET_MONITOR_H | ||
|
||
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
|
||
#include "ray/gcs/client.h" | ||
#include "ray/id.h" | ||
|
||
namespace ray { | ||
|
||
namespace raylet { | ||
|
||
class Monitor { | ||
public: | ||
/// Create a Raylet monitor attached to the given GCS address and port. | ||
/// | ||
/// \param io_service The event loop to run the monitor on. | ||
/// \param redis_address The GCS Redis address to connect to. | ||
/// \param redis_port The GCS Redis port to connect to. | ||
Monitor(boost::asio::io_service &io_service, const std::string &redis_address, | ||
int redis_port); | ||
|
||
/// Start the monitor. Listen for heartbeats from Raylets and mark Raylets | ||
/// that do not send a heartbeat within a given period as dead. | ||
void Start(); | ||
|
||
/// A periodic timer that fires on every heartbeat period. Raylets that have | ||
/// not sent a heartbeat within the last num_heartbeats_timeout ticks will be | ||
/// marked as dead in the client table. | ||
void Tick(); | ||
|
||
/// Handle a heartbeat from a Raylet. | ||
/// | ||
/// \param client_id The client ID of the Raylet that sent the heartbeat. | ||
void HandleHeartbeat(const ClientID &client_id); | ||
|
||
private: | ||
/// A client to the GCS, through which heartbeats are received. | ||
gcs::AsyncGcsClient gcs_client_; | ||
/// The expected period between heartbeats, for an individual Raylet. | ||
int64_t heartbeat_timeout_ms_; | ||
/// A timer that ticks every heartbeat_timeout_ms_ milliseconds. | ||
boost::asio::deadline_timer heartbeat_timer_; | ||
/// For each Raylet that we receive a heartbeat from, the number of ticks | ||
/// that may pass before the Raylet will be declared dead. | ||
std::unordered_map<ClientID, int64_t, UniqueIDHasher> heartbeats_; | ||
/// The Raylets that have been marked as dead in the client table. | ||
std::unordered_set<ClientID, UniqueIDHasher> dead_clients_; | ||
}; | ||
|
||
} // namespace raylet | ||
|
||
} // namespace ray | ||
|
||
#endif // RAY_RAYLET_MONITOR_H |
Oops, something went wrong.