Skip to content

Commit cbf3181

Browse files
stephanie-wangrobertnishihara
authored andcommitted
[xray] Monitor for Raylet processes (#1831)
* Add raylet monitor script to timeout Raylet heartbeats * Unit test for removing a different client from the client table * Set node manager heartbeat according to global config * Doc and fixes * Add regression test for client table disconnect, refactor client table * Fix linting.
1 parent 0d9a7a3 commit cbf3181

File tree

11 files changed

+274
-29
lines changed

11 files changed

+274
-29
lines changed

python/ray/services.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@
7777
os.path.abspath(os.path.dirname(__file__)),
7878
"core/src/credis/build/src/libmember.so")
7979

80-
# Location of the raylet executable.
80+
# Location of the raylet executables.
81+
RAYLET_MONITOR_EXECUTABLE = os.path.join(
82+
os.path.abspath(os.path.dirname(__file__)),
83+
"core/src/ray/raylet/raylet_monitor")
8184
RAYLET_EXECUTABLE = os.path.join(
8285
os.path.abspath(os.path.dirname(__file__)),
8386
"core/src/ray/raylet/raylet")
@@ -1112,11 +1115,35 @@ def start_monitor(redis_address, node_ip_address, stdout_file=None,
11121115
command.append("--autoscaling-config=" + str(autoscaling_config))
11131116
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
11141117
if cleanup:
1115-
all_processes[PROCESS_TYPE_WORKER].append(p)
1118+
all_processes[PROCESS_TYPE_MONITOR].append(p)
11161119
record_log_files_in_redis(redis_address, node_ip_address,
11171120
[stdout_file, stderr_file])
11181121

11191122

1123+
def start_raylet_monitor(redis_address, stdout_file=None,
1124+
stderr_file=None, cleanup=True):
1125+
"""Run a process to monitor the other processes.
1126+
1127+
Args:
1128+
redis_address (str): The address that the Redis server is listening on.
1129+
stdout_file: A file handle opened for writing to redirect stdout to. If
1130+
no redirection should happen, then this should be None.
1131+
stderr_file: A file handle opened for writing to redirect stderr to. If
1132+
no redirection should happen, then this should be None.
1133+
cleanup (bool): True if using Ray in local mode. If cleanup is true,
1134+
then this process will be killed by services.cleanup() when the
1135+
Python process that imported services exits. This is True by
1136+
default.
1137+
"""
1138+
gcs_ip_address, gcs_port = redis_address.split(":")
1139+
command = [RAYLET_MONITOR_EXECUTABLE,
1140+
gcs_ip_address,
1141+
gcs_port]
1142+
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
1143+
if cleanup:
1144+
all_processes[PROCESS_TYPE_MONITOR].append(p)
1145+
1146+
11201147
def start_ray_processes(address_info=None,
11211148
node_ip_address="127.0.0.1",
11221149
redis_port=None,
@@ -1253,6 +1280,11 @@ def start_ray_processes(address_info=None,
12531280
stderr_file=monitor_stderr_file,
12541281
cleanup=cleanup,
12551282
autoscaling_config=autoscaling_config)
1283+
if use_raylet:
1284+
start_raylet_monitor(redis_address,
1285+
stdout_file=monitor_stdout_file,
1286+
stderr_file=monitor_stderr_file,
1287+
cleanup=cleanup)
12561288

12571289
if redis_shards == []:
12581290
# Get redis shards from primary redis instance.

python/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"ray/core/src/local_scheduler/local_scheduler",
2424
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
2525
"ray/core/src/global_scheduler/global_scheduler",
26+
"ray/core/src/ray/raylet/raylet_monitor",
2627
"ray/core/src/ray/raylet/raylet",
2728
"ray/WebUI.ipynb"
2829
]

src/ray/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ set(RAY_SRCS
4444
object_manager/object_directory.cc
4545
object_manager/transfer_queue.cc
4646
object_manager/object_manager.cc
47+
raylet/monitor.cc
4748
raylet/mock_gcs_client.cc
4849
raylet/task.cc
4950
raylet/task_execution_spec.cc

src/ray/gcs/client_test.cc

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -781,20 +781,22 @@ void TestClientTableDisconnect(const JobID &job_id,
781781
client->client_table().RegisterClientAddedCallback(
782782
[](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) {
783783
ClientTableNotification(client, id, data, true);
784+
// Disconnect from the client table. We should receive a notification
785+
// for the removal of our own entry.
786+
RAY_CHECK_OK(client->client_table().Disconnect());
784787
});
785788
client->client_table().RegisterClientRemovedCallback(
786789
[](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) {
787790
ClientTableNotification(client, id, data, false);
788791
test->Stop();
789792
});
790-
// Connect and disconnect to client table. We should receive notifications
791-
// for the addition and removal of our own entry.
793+
// Connect to the client table. We should receive notification for the
794+
// addition of our own entry.
792795
ClientTableDataT local_client_info = client->client_table().GetLocalClient();
793796
local_client_info.node_manager_address = "127.0.0.1";
794797
local_client_info.node_manager_port = 0;
795798
local_client_info.object_manager_port = 0;
796799
RAY_CHECK_OK(client->client_table().Connect(local_client_info));
797-
RAY_CHECK_OK(client->client_table().Disconnect());
798800
test->Start();
799801
}
800802

@@ -803,4 +805,59 @@ TEST_F(TestGcsWithAsio, TestClientTableDisconnect) {
803805
TestClientTableDisconnect(job_id_, client_);
804806
}
805807

808+
void TestClientTableImmediateDisconnect(const JobID &job_id,
809+
std::shared_ptr<gcs::AsyncGcsClient> client) {
810+
// Register callbacks for when a client gets added and removed. The latter
811+
// event will stop the event loop.
812+
client->client_table().RegisterClientAddedCallback(
813+
[](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) {
814+
ClientTableNotification(client, id, data, true);
815+
});
816+
client->client_table().RegisterClientRemovedCallback(
817+
[](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) {
818+
ClientTableNotification(client, id, data, false);
819+
test->Stop();
820+
});
821+
// Connect to then immediately disconnect from the client table. We should
822+
// receive notifications for the addition and removal of our own entry.
823+
ClientTableDataT local_client_info = client->client_table().GetLocalClient();
824+
local_client_info.node_manager_address = "127.0.0.1";
825+
local_client_info.node_manager_port = 0;
826+
local_client_info.object_manager_port = 0;
827+
RAY_CHECK_OK(client->client_table().Connect(local_client_info));
828+
RAY_CHECK_OK(client->client_table().Disconnect());
829+
test->Start();
830+
}
831+
832+
TEST_F(TestGcsWithAsio, TestClientTableImmediateDisconnect) {
833+
test = this;
834+
TestClientTableImmediateDisconnect(job_id_, client_);
835+
}
836+
837+
void TestClientTableMarkDisconnected(const JobID &job_id,
838+
std::shared_ptr<gcs::AsyncGcsClient> client) {
839+
ClientTableDataT local_client_info = client->client_table().GetLocalClient();
840+
local_client_info.node_manager_address = "127.0.0.1";
841+
local_client_info.node_manager_port = 0;
842+
local_client_info.object_manager_port = 0;
843+
// Connect to the client table to start receiving notifications.
844+
RAY_CHECK_OK(client->client_table().Connect(local_client_info));
845+
// Mark a different client as dead.
846+
ClientID dead_client_id = ClientID::from_random();
847+
RAY_CHECK_OK(client->client_table().MarkDisconnected(dead_client_id));
848+
// Make sure we only get a notification for the removal of the client we
849+
// marked as dead.
850+
client->client_table().RegisterClientRemovedCallback([dead_client_id](
851+
gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) {
852+
ASSERT_EQ(ClientID::from_binary(data.client_id), dead_client_id);
853+
test->Stop();
854+
});
855+
test->Start();
856+
}
857+
858+
TEST_F(TestGcsWithAsio, TestClientTableMarkDisconnected) {
859+
test = this;
860+
TestClientTableMarkDisconnected(job_id_, client_);
861+
}
862+
806863
} // namespace

src/ray/gcs/tables.cc

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -276,41 +276,40 @@ Status ClientTable::Connect(const ClientTableDataT &local_client) {
276276
RAY_CHECK(local_client.client_id == local_client_.client_id);
277277
local_client_ = local_client;
278278

279+
// Construct the data to add to the client table.
279280
auto data = std::make_shared<ClientTableDataT>(local_client_);
280281
data->is_insertion = true;
281-
// Callback for a notification from the client table.
282-
auto notification_callback = [this](
283-
AsyncGcsClient *client, const UniqueID &log_key,
284-
const std::vector<ClientTableDataT> &notifications) {
285-
RAY_CHECK(log_key == client_log_key_);
286-
for (auto &notification : notifications) {
287-
HandleNotification(client, notification);
288-
}
289-
};
290282
// Callback to handle our own successful connection once we've added
291283
// ourselves.
292284
auto add_callback = [this](AsyncGcsClient *client, const UniqueID &log_key,
293285
std::shared_ptr<ClientTableDataT> data) {
294286
RAY_CHECK(log_key == client_log_key_);
295287
HandleConnected(client, data);
288+
289+
// Callback for a notification from the client table.
290+
auto notification_callback = [this](
291+
AsyncGcsClient *client, const UniqueID &log_key,
292+
const std::vector<ClientTableDataT> &notifications) {
293+
RAY_CHECK(log_key == client_log_key_);
294+
for (auto &notification : notifications) {
295+
HandleNotification(client, notification);
296+
}
297+
};
298+
// Callback to request notifications from the client table once we've
299+
// successfully subscribed.
300+
auto subscription_callback = [this](AsyncGcsClient *c) {
301+
RAY_CHECK_OK(RequestNotifications(JobID::nil(), client_log_key_, client_id_));
302+
};
303+
// Subscribe to the client table.
304+
RAY_CHECK_OK(Subscribe(JobID::nil(), client_id_, notification_callback,
305+
subscription_callback));
296306
};
297-
// Callback to add ourselves once we've successfully subscribed.
298-
auto subscription_callback = [this, data, add_callback](AsyncGcsClient *c) {
299-
// Mark ourselves as deleted if we called Disconnect() since the last
300-
// Connect() call.
301-
if (disconnected_) {
302-
data->is_insertion = false;
303-
}
304-
RAY_CHECK_OK(RequestNotifications(JobID::nil(), client_log_key_, client_id_));
305-
RAY_CHECK_OK(Append(JobID::nil(), client_log_key_, data, add_callback));
306-
};
307-
return Subscribe(JobID::nil(), client_id_, notification_callback,
308-
subscription_callback);
307+
return Append(JobID::nil(), client_log_key_, data, add_callback);
309308
}
310309

311310
Status ClientTable::Disconnect() {
312311
auto data = std::make_shared<ClientTableDataT>(local_client_);
313-
data->is_insertion = true;
312+
data->is_insertion = false;
314313
auto add_callback = [this](AsyncGcsClient *client, const ClientID &id,
315314
std::shared_ptr<ClientTableDataT> data) {
316315
HandleConnected(client, data);
@@ -322,6 +321,13 @@ Status ClientTable::Disconnect() {
322321
return Status::OK();
323322
}
324323

324+
ray::Status ClientTable::MarkDisconnected(const ClientID &dead_client_id) {
325+
auto data = std::make_shared<ClientTableDataT>();
326+
data->client_id = dead_client_id.binary();
327+
data->is_insertion = false;
328+
return Append(JobID::nil(), client_log_key_, data, nullptr);
329+
}
330+
325331
const ClientTableDataT &ClientTable::GetClient(const ClientID &client_id) {
326332
RAY_CHECK(!client_id.is_nil());
327333
auto entry = client_cache_.find(client_id);

src/ray/gcs/tables.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,13 @@ class ClientTable : private Log<UniqueID, ClientTableData> {
435435
/// \return Status
436436
ray::Status Disconnect();
437437

438+
/// Mark a different client as disconnected. The client ID should never be
439+
/// reused for a new client.
440+
///
441+
/// \param dead_client_id The ID of the client to mark as dead.
442+
/// \return Status
443+
ray::Status MarkDisconnected(const ClientID &dead_client_id);
444+
438445
/// Register a callback to call when a new client is added.
439446
///
440447
/// \param callback The callback to register.

src/ray/raylet/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ target_link_libraries(rayletlib ray_static ${Boost_SYSTEM_LIBRARY})
3232
add_executable(raylet main.cc)
3333
target_link_libraries(raylet rayletlib ${Boost_SYSTEM_LIBRARY} pthread)
3434

35+
add_executable(raylet_monitor monitor_main.cc)
36+
target_link_libraries(raylet_monitor rayletlib ${Boost_SYSTEM_LIBRARY} pthread)
37+
3538
install(FILES
3639
raylet
3740
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/ray/raylet")

src/ray/raylet/main.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ int main(int argc, char *argv[]) {
2929
node_manager_config.worker_command.push_back(token);
3030
}
3131

32-
// TODO(swang): Set this from a global config.
33-
node_manager_config.heartbeat_period_ms = 100;
32+
node_manager_config.heartbeat_period_ms =
33+
RayConfig::instance().heartbeat_timeout_milliseconds();
3434

3535
// Configuration for the object manager.
3636
ray::ObjectManagerConfig object_manager_config;

src/ray/raylet/monitor.cc

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#include "ray/raylet/monitor.h"
2+
3+
#include "ray/status.h"
4+
5+
namespace ray {
6+
7+
namespace raylet {
8+
9+
/// \class Monitor
10+
///
11+
/// The monitor is responsible for listening for heartbeats from Raylets and
12+
/// deciding when a Raylet has died. If the monitor does not hear from a Raylet
13+
/// within heartbeat_timeout_milliseconds * num_heartbeats_timeout (defined in
14+
/// the Ray configuration), then the monitor will mark that Raylet as dead in
15+
/// the client table, which broadcasts the event to all other Raylets.
16+
Monitor::Monitor(boost::asio::io_service &io_service, const std::string &redis_address,
17+
int redis_port)
18+
: gcs_client_(),
19+
heartbeat_timeout_ms_(RayConfig::instance().num_heartbeats_timeout()),
20+
heartbeat_timer_(io_service) {
21+
RAY_CHECK_OK(gcs_client_.Connect(redis_address, redis_port));
22+
RAY_CHECK_OK(gcs_client_.Attach(io_service));
23+
}
24+
25+
void Monitor::HandleHeartbeat(const ClientID &client_id) {
26+
heartbeats_[client_id] = heartbeat_timeout_ms_;
27+
}
28+
29+
void Monitor::Start() {
30+
const auto heartbeat_callback = [this](gcs::AsyncGcsClient *client, const ClientID &id,
31+
const HeartbeatTableDataT &heartbeat_data) {
32+
HandleHeartbeat(id);
33+
};
34+
RAY_CHECK_OK(gcs_client_.heartbeat_table().Subscribe(UniqueID::nil(), UniqueID::nil(),
35+
heartbeat_callback, nullptr));
36+
Tick();
37+
}
38+
39+
/// A periodic timer that checks for timed out clients.
40+
void Monitor::Tick() {
41+
for (auto it = heartbeats_.begin(); it != heartbeats_.end();) {
42+
it->second--;
43+
if (it->second == 0) {
44+
if (dead_clients_.count(it->first) == 0) {
45+
RAY_LOG(WARNING) << "Client timed out: " << it->first;
46+
RAY_CHECK_OK(gcs_client_.client_table().MarkDisconnected(it->first));
47+
dead_clients_.insert(it->first);
48+
}
49+
it = heartbeats_.erase(it);
50+
} else {
51+
it++;
52+
}
53+
}
54+
55+
auto heartbeat_period = boost::posix_time::milliseconds(
56+
RayConfig::instance().heartbeat_timeout_milliseconds());
57+
heartbeat_timer_.expires_from_now(heartbeat_period);
58+
heartbeat_timer_.async_wait([this](const boost::system::error_code &error) {
59+
RAY_CHECK(!error);
60+
Tick();
61+
});
62+
}
63+
64+
} // namespace raylet
65+
66+
} // namespace ray

src/ray/raylet/monitor.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#ifndef RAY_RAYLET_MONITOR_H
2+
#define RAY_RAYLET_MONITOR_H
3+
4+
#include <memory>
5+
#include <unordered_set>
6+
7+
#include "ray/gcs/client.h"
8+
#include "ray/id.h"
9+
10+
namespace ray {
11+
12+
namespace raylet {
13+
14+
class Monitor {
15+
public:
16+
/// Create a Raylet monitor attached to the given GCS address and port.
17+
///
18+
/// \param io_service The event loop to run the monitor on.
19+
/// \param redis_address The GCS Redis address to connect to.
20+
/// \param redis_port The GCS Redis port to connect to.
21+
Monitor(boost::asio::io_service &io_service, const std::string &redis_address,
22+
int redis_port);
23+
24+
/// Start the monitor. Listen for heartbeats from Raylets and mark Raylets
25+
/// that do not send a heartbeat within a given period as dead.
26+
void Start();
27+
28+
/// A periodic timer that fires on every heartbeat period. Raylets that have
29+
/// not sent a heartbeat within the last num_heartbeats_timeout ticks will be
30+
/// marked as dead in the client table.
31+
void Tick();
32+
33+
/// Handle a heartbeat from a Raylet.
34+
///
35+
/// \param client_id The client ID of the Raylet that sent the heartbeat.
36+
void HandleHeartbeat(const ClientID &client_id);
37+
38+
private:
39+
/// A client to the GCS, through which heartbeats are received.
40+
gcs::AsyncGcsClient gcs_client_;
41+
/// The expected period between heartbeats, for an individual Raylet.
42+
int64_t heartbeat_timeout_ms_;
43+
/// A timer that ticks every heartbeat_timeout_ms_ milliseconds.
44+
boost::asio::deadline_timer heartbeat_timer_;
45+
/// For each Raylet that we receive a heartbeat from, the number of ticks
46+
/// that may pass before the Raylet will be declared dead.
47+
std::unordered_map<ClientID, int64_t, UniqueIDHasher> heartbeats_;
48+
/// The Raylets that have been marked as dead in the client table.
49+
std::unordered_set<ClientID, UniqueIDHasher> dead_clients_;
50+
};
51+
52+
} // namespace raylet
53+
54+
} // namespace ray
55+
56+
#endif // RAY_RAYLET_MONITOR_H

0 commit comments

Comments
 (0)