Skip to content

Commit e53cd65

Browse files
committed
something
Signed-off-by: Cuong Nguyen <can@anyscale.com>
1 parent aa6da60 commit e53cd65

21 files changed

+230
-133
lines changed

src/ray/common/metrics.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ inline ray::stats::Gauge GetActorMetric() {
2525
/// we use the "Source" required label.
2626
return ray::stats::Gauge{
2727
/*name=*/"actors",
28-
/*description=*/"An actor can be in one of DEPENDENCIES_UNREADY, PENDING_CREATION, ALIVE, "
29-
"ALIVE_IDLE, ALIVE_RUNNING_TASKS, RESTARTING, or DEAD states. ",
28+
/*description=*/
29+
"An actor can be in one of DEPENDENCIES_UNREADY, PENDING_CREATION, ALIVE, "
30+
"ALIVE_IDLE, ALIVE_RUNNING_TASKS, RESTARTING, or DEAD states. "
3031
"An actor is considered ALIVE_IDLE if it is not executing any tasks.",
3132
/*unit=*/"",
3233
// State: the actor state, which is from rpc::ActorTableData::ActorState,

src/ray/core_worker/core_worker.cc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,16 +231,16 @@ void TaskCounter::RecordMetrics() {
231231
} else {
232232
running_tasks = 1.0;
233233
}
234-
ray::stats::STATS_actors.Record(idle,
235-
{{"State", "ALIVE_IDLE"},
236-
{"Name", actor_name_},
237-
{"Source", "executor"},
238-
{"JobId", job_id_}});
239-
ray::stats::STATS_actors.Record(running_tasks,
240-
{{"State", "ALIVE_RUNNING_TASKS"},
241-
{"Name", actor_name_},
242-
{"Source", "executor"},
243-
{"JobId", job_id_}});
234+
actor_by_state_gauge_.Record(idle,
235+
{{"State"sv, "ALIVE_IDLE"},
236+
{"Name"sv, actor_name_},
237+
{"Source"sv, "executor"},
238+
{"JobId"sv, job_id_}});
239+
actor_by_state_gauge_.Record(running_tasks,
240+
{{"State"sv, "ALIVE_RUNNING_TASKS"},
241+
{"Name"sv, actor_name_},
242+
{"Source"sv, "executor"},
243+
{"JobId"sv, job_id_}});
244244
}
245245
}
246246

src/ray/core_worker/tests/core_worker_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,8 @@ class CoreWorkerTest : public ::testing::Test {
291291
pubsub::Publisher *object_info_publisher_;
292292
std::shared_ptr<TaskManager> task_manager_;
293293
std::shared_ptr<CoreWorker> core_worker_;
294-
ray::observability::FakeMetric fake_task_by_state_gauge_;
295-
ray::observability::FakeMetric fake_actor_by_state_gauge_;
294+
ray::observability::FakeGauge fake_task_by_state_gauge_;
295+
ray::observability::FakeGauge fake_actor_by_state_gauge_;
296296
std::unique_ptr<FakePeriodicalRunner> fake_periodical_runner_;
297297

298298
// Controllable time for testing publisher timeouts

src/ray/gcs/gcs_placement_group_manager.cc

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,21 +55,6 @@ ExponentialBackoff CreateDefaultBackoff() {
5555
}
5656
} // namespace
5757

58-
GcsPlacementGroupManager::GcsPlacementGroupManager(
59-
instrumented_io_context &io_context,
60-
GcsResourceManager &gcs_resource_manager,
61-
ray::observability::MetricInterface &placement_group_gauge,
62-
ray::observability::MetricInterface &placement_group_creation_latency_in_ms_histogram,
63-
ray::observability::MetricInterface
64-
&placement_group_scheduling_latency_in_ms_histogram)
65-
: io_context_(io_context),
66-
gcs_resource_manager_(gcs_resource_manager),
67-
placement_group_gauge_(placement_group_gauge),
68-
placement_group_creation_latency_in_ms_histogram_(
69-
placement_group_creation_latency_in_ms_histogram),
70-
placement_group_scheduling_latency_in_ms_histogram_(
71-
placement_group_scheduling_latency_in_ms_histogram) {}
72-
7358
GcsPlacementGroupManager::GcsPlacementGroupManager(
7459
instrumented_io_context &io_context,
7560
GcsPlacementGroupSchedulerInterface *scheduler,
@@ -79,7 +64,8 @@ GcsPlacementGroupManager::GcsPlacementGroupManager(
7964
ray::observability::MetricInterface &placement_group_gauge,
8065
ray::observability::MetricInterface &placement_group_creation_latency_in_ms_histogram,
8166
ray::observability::MetricInterface
82-
&placement_group_scheduling_latency_in_ms_histogram)
67+
&placement_group_scheduling_latency_in_ms_histogram,
68+
ray::observability::MetricInterface &placement_group_count_gauge)
8369
: io_context_(io_context),
8470
gcs_placement_group_scheduler_(scheduler),
8571
gcs_table_storage_(gcs_table_storage),
@@ -89,7 +75,8 @@ GcsPlacementGroupManager::GcsPlacementGroupManager(
8975
placement_group_creation_latency_in_ms_histogram_(
9076
placement_group_creation_latency_in_ms_histogram),
9177
placement_group_scheduling_latency_in_ms_histogram_(
92-
placement_group_scheduling_latency_in_ms_histogram) {
78+
placement_group_scheduling_latency_in_ms_histogram),
79+
placement_group_count_gauge_(placement_group_count_gauge) {
9380
placement_group_state_counter_.reset(
9481
new CounterMap<rpc::PlacementGroupTableData::PlacementGroupState>());
9582
placement_group_state_counter_->SetOnChangeCallback(
@@ -992,12 +979,12 @@ std::string GcsPlacementGroupManager::DebugString() const {
992979
}
993980

994981
void GcsPlacementGroupManager::RecordMetrics() const {
995-
placement_group_gauge_.Record(pending_placement_groups_.size(),
996-
{{"State"sv, "Pending"}});
997-
placement_group_gauge_.Record(registered_placement_groups_.size(),
998-
{{"State"sv, "Registered"}});
999-
placement_group_gauge_.Record(infeasible_placement_groups_.size(),
1000-
{{"State"sv, "Infeasible"}});
982+
placement_group_count_gauge_.Record(pending_placement_groups_.size(),
983+
{{"State"sv, "Pending"}});
984+
placement_group_count_gauge_.Record(registered_placement_groups_.size(),
985+
{{"State"sv, "Registered"}});
986+
placement_group_count_gauge_.Record(infeasible_placement_groups_.size(),
987+
{{"State"sv, "Infeasible"}});
1001988
if (usage_stats_client_) {
1002989
usage_stats_client_->RecordExtraUsageCounter(usage::TagKey::PG_NUM_CREATED,
1003990
lifetime_num_placement_groups_created_);

src/ray/gcs/gcs_placement_group_manager.h

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -56,16 +56,18 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoGcsServiceHandler
5656
/// \param gcs_table_storage Used to flush placement group data to storage.
5757
/// \param gcs_resource_manager Reference of GcsResourceManager.
5858
/// \param get_ray_namespace A callback to get the ray namespace.
59-
GcsPlacementGroupManager(instrumented_io_context &io_context,
60-
GcsPlacementGroupSchedulerInterface *scheduler,
61-
gcs::GcsTableStorage *gcs_table_storage,
62-
GcsResourceManager &gcs_resource_manager,
63-
std::function<std::string(const JobID &)> get_ray_namespace,
64-
ray::observability::MetricInterface &placement_group_gauge,
65-
ray::observability::MetricInterface
66-
&placement_group_creation_latency_in_ms_histogram,
67-
ray::observability::MetricInterface
68-
&placement_group_scheduling_latency_in_ms_histogram);
59+
GcsPlacementGroupManager(
60+
instrumented_io_context &io_context,
61+
GcsPlacementGroupSchedulerInterface *scheduler,
62+
gcs::GcsTableStorage *gcs_table_storage,
63+
GcsResourceManager &gcs_resource_manager,
64+
std::function<std::string(const JobID &)> get_ray_namespace,
65+
ray::observability::MetricInterface &placement_group_gauge,
66+
ray::observability::MetricInterface
67+
&placement_group_creation_latency_in_ms_histogram,
68+
ray::observability::MetricInterface
69+
&placement_group_scheduling_latency_in_ms_histogram,
70+
ray::observability::MetricInterface &placement_group_count_gauge);
6971

7072
~GcsPlacementGroupManager() override = default;
7173

@@ -227,7 +229,8 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoGcsServiceHandler
227229
ray::observability::MetricInterface
228230
&placement_group_creation_latency_in_ms_histogram,
229231
ray::observability::MetricInterface
230-
&placement_group_scheduling_latency_in_ms_histogram);
232+
&placement_group_scheduling_latency_in_ms_histogram,
233+
ray::observability::MetricInterface &placement_group_count_gauge);
231234

232235
private:
233236
/// Push a placement group to pending queue.
@@ -359,6 +362,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoGcsServiceHandler
359362
ray::observability::MetricInterface &placement_group_creation_latency_in_ms_histogram_;
360363
ray::observability::MetricInterface
361364
&placement_group_scheduling_latency_in_ms_histogram_;
365+
ray::observability::MetricInterface &placement_group_count_gauge_;
362366

363367
FRIEND_TEST(GcsPlacementGroupManagerMockTest, PendingQueuePriorityReschedule);
364368
FRIEND_TEST(GcsPlacementGroupManagerMockTest, PendingQueuePriorityFailed);

src/ray/gcs/gcs_server.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ GcsServer::GcsServer(
7171
ray::observability::MetricInterface &placement_group_creation_latency_in_ms_histogram,
7272
ray::observability::MetricInterface
7373
&placement_group_scheduling_latency_in_ms_histogram,
74+
ray::observability::MetricInterface &placement_group_count_gauge,
7475
ray::observability::MetricInterface &task_events_reported_gauge,
7576
ray::observability::MetricInterface &task_events_dropped_gauge,
7677
ray::observability::MetricInterface &task_events_stored_gauge,
@@ -161,6 +162,7 @@ GcsServer::GcsServer(
161162
placement_group_creation_latency_in_ms_histogram),
162163
placement_group_scheduling_latency_in_ms_histogram_(
163164
placement_group_scheduling_latency_in_ms_histogram),
165+
placement_group_count_gauge_(placement_group_count_gauge),
164166
task_events_reported_gauge_(task_events_reported_gauge),
165167
task_events_dropped_gauge_(task_events_dropped_gauge),
166168
task_events_stored_gauge_(task_events_stored_gauge),
@@ -588,7 +590,8 @@ void GcsServer::InitGcsPlacementGroupManager(const GcsInitData &gcs_init_data) {
588590
},
589591
placement_group_gauge_,
590592
placement_group_creation_latency_in_ms_histogram_,
591-
placement_group_scheduling_latency_in_ms_histogram_);
593+
placement_group_scheduling_latency_in_ms_histogram_,
594+
placement_group_count_gauge_);
592595

593596
gcs_placement_group_manager_->Initialize(gcs_init_data);
594597
rpc_server_.RegisterService(std::make_unique<rpc::PlacementGroupInfoGrpcService>(

src/ray/gcs/gcs_server.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ class GcsServer {
111111
&placement_group_creation_latency_in_ms_histogram,
112112
ray::observability::MetricInterface
113113
&placement_group_scheduling_latency_in_ms_histogram,
114+
ray::observability::MetricInterface &placement_group_count_gauge,
114115
ray::observability::MetricInterface &task_events_reported_gauge,
115116
ray::observability::MetricInterface &task_events_dropped_gauge,
116117
ray::observability::MetricInterface &task_events_stored_gauge,
@@ -334,6 +335,7 @@ class GcsServer {
334335
ray::observability::MetricInterface &placement_group_creation_latency_in_ms_histogram_;
335336
ray::observability::MetricInterface
336337
&placement_group_scheduling_latency_in_ms_histogram_;
338+
ray::observability::MetricInterface &placement_group_count_gauge_;
337339
ray::observability::MetricInterface &task_events_reported_gauge_;
338340
ray::observability::MetricInterface &task_events_dropped_gauge_;
339341
ray::observability::MetricInterface &task_events_stored_gauge_;

src/ray/gcs/gcs_server_main.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ int main(int argc, char *argv[]) {
178178
ray::gcs::GetPlacementGroupCreationLatencyInMsMetric()};
179179
ray::stats::Histogram placement_group_scheduling_latency_in_ms_histogram{
180180
ray::gcs::GetPlacementGroupSchedulingLatencyInMsMetric()};
181+
ray::stats::Gauge placement_group_count_gauge{ray::gcs::GetPlacementGroupCountMetric()};
181182
ray::stats::Gauge task_events_reported_gauge{
182183
ray::gcs::GetTaskManagerTaskEventsReportedMetric()};
183184
ray::stats::Gauge task_events_dropped_gauge{
@@ -201,6 +202,7 @@ int main(int argc, char *argv[]) {
201202
placement_group_gauge,
202203
placement_group_creation_latency_in_ms_histogram,
203204
placement_group_scheduling_latency_in_ms_histogram,
205+
placement_group_count_gauge,
204206
task_events_reported_gauge,
205207
task_events_dropped_gauge,
206208
task_events_stored_gauge,

src/ray/gcs/metrics.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ inline ray::stats::Gauge GetPlacementGroupMetric() {
5252
/*description=*/"Number of placement groups broken down by state.",
5353
/*unit=*/"",
5454
// State: from rpc::PlacementGroupData::PlacementGroupState.
55-
/*tag_keys=*/{"State"},
55+
/*tag_keys=*/{"State", "Source"},
5656
};
5757
}
5858

@@ -76,6 +76,17 @@ inline ray::stats::Histogram GetPlacementGroupSchedulingLatencyInMsMetric() {
7676
};
7777
}
7878

79+
inline ray::stats::Gauge GetPlacementGroupCountMetric() {
80+
return ray::stats::Gauge{
81+
/*name=*/"gcs_placement_group_count",
82+
/*description=*/
83+
"Number of placement groups broken down by state in {Registered, Pending, "
84+
"Infeasible}",
85+
/*unit=*/"",
86+
/*tag_keys=*/{"State"},
87+
};
88+
}
89+
7990
inline ray::stats::Gauge GetTaskManagerTaskEventsReportedMetric() {
8091
return ray::stats::Gauge{
8192
/*name=*/"gcs_task_manager_task_events_reported",

src/ray/gcs/store_client/tests/observable_store_client_test.cc

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,36 @@ class ObservableStoreClientTest : public StoreClientTestBase {
3535
void TestMetrics() override {
3636
auto counter_tag_to_value = fake_storage_operation_count_counter_.GetTagToValue();
3737
// 3 operations: Put, Get, Delete
38+
// Get operations include both Get() and GetEmpty() calls, so they're grouped together
3839
ASSERT_EQ(counter_tag_to_value.size(), 3);
40+
41+
// Check each operation type individually
3942
for (const auto &[key, value] : counter_tag_to_value) {
40-
ASSERT_EQ(value, 1);
43+
// Find the operation type
44+
std::string operation_type;
45+
for (const auto &[k, v] : key) {
46+
if (k == "Operation") {
47+
operation_type = v;
48+
break;
49+
}
50+
}
51+
52+
if (operation_type == "Put" || operation_type == "Delete") {
53+
ASSERT_EQ(value, 5000) << "Expected 5000 for " << operation_type << " operation";
54+
} else if (operation_type == "Get") {
55+
ASSERT_EQ(value, 10000) << "Expected 10000 for Get operation (5000 from Get() + "
56+
"5000 from GetEmpty())";
57+
}
4158
}
59+
4260
auto latency_tag_to_value =
4361
fake_storage_operation_latency_in_ms_histogram_.GetTagToValue();
4462
// 3 operations: Put, Get, Delete
4563
ASSERT_EQ(latency_tag_to_value.size(), 3);
4664
}
4765

48-
ray::observability::FakeMetric fake_storage_operation_latency_in_ms_histogram_;
49-
ray::observability::FakeMetric fake_storage_operation_count_counter_;
66+
ray::observability::FakeHistogram fake_storage_operation_latency_in_ms_histogram_;
67+
ray::observability::FakeCounter fake_storage_operation_count_counter_;
5068
};
5169

5270
TEST_F(ObservableStoreClientTest, AsyncPutAndAsyncGetTest) { TestAsyncPutAndAsyncGet(); }

0 commit comments

Comments
 (0)