Skip to content

Commit 55a1d10

Browse files
committed
Revert "Revert "[core] (cgroups 6/n) CgroupManager cleans up the entire cgroup hierarchy in reverse order when destroyed. (ray-project#56260)""
This reverts commit c4c774e.
1 parent fecf786 commit 55a1d10

File tree

18 files changed

+493
-159
lines changed

18 files changed

+493
-159
lines changed

python/ray/_private/worker.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2440,7 +2440,6 @@ def is_initialized() -> bool:
24402440
return ray._private.worker.global_worker.connected
24412441

24422442

2443-
# TODO(hjiang): Add cgroup path along with [enable_resource_isolation].
24442443
@with_connect_or_shutdown_lock
24452444
def connect(
24462445
node,
@@ -2459,7 +2458,6 @@ def connect(
24592458
worker_launch_time_ms: int = -1,
24602459
worker_launched_time_ms: int = -1,
24612460
debug_source: str = "",
2462-
enable_resource_isolation: bool = False,
24632461
):
24642462
"""Connect this worker to the raylet, to Plasma, and to GCS.
24652463
@@ -2488,7 +2486,6 @@ def connect(
24882486
finshes launching. If the worker is not launched by raylet (e.g.,
24892487
driver), this must be -1 (default value).
24902488
debug_source: Source information for `CoreWorker`, used for debugging and informational purpose, rather than functional purpose.
2491-
enable_resource_isolation: If true, core worker enables resource isolation by adding itself into appropriate cgroup.
24922489
"""
24932490
# Do some basic checking to make sure we didn't call ray.init twice.
24942491
error_message = "Perhaps you called ray.init twice by accident?"
@@ -2667,7 +2664,6 @@ def connect(
26672664
worker_launch_time_ms,
26682665
worker_launched_time_ms,
26692666
debug_source,
2670-
enable_resource_isolation,
26712667
)
26722668

26732669
if mode == SCRIPT_MODE:

python/ray/_private/workers/default_worker.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,6 @@
271271
ray_debugger_external=args.ray_debugger_external,
272272
worker_launch_time_ms=args.worker_launch_time_ms,
273273
worker_launched_time_ms=worker_launched_time_ms,
274-
enable_resource_isolation=args.enable_resource_isolation,
275274
)
276275

277276
worker = ray._private.worker.global_worker

python/ray/_raylet.pyx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3005,7 +3005,7 @@ cdef class CoreWorker:
30053005
local_mode, driver_name,
30063006
serialized_job_config, metrics_agent_port, runtime_env_hash,
30073007
startup_token, session_name, cluster_id, entrypoint,
3008-
worker_launch_time_ms, worker_launched_time_ms, debug_source, enable_resource_isolation):
3008+
worker_launch_time_ms, worker_launched_time_ms, debug_source):
30093009
self.is_local_mode = local_mode
30103010

30113011
cdef CCoreWorkerOptions options = CCoreWorkerOptions()
@@ -3061,7 +3061,6 @@ cdef class CoreWorker:
30613061
options.worker_launch_time_ms = worker_launch_time_ms
30623062
options.worker_launched_time_ms = worker_launched_time_ms
30633063
options.debug_source = debug_source
3064-
options.enable_resource_isolation = enable_resource_isolation
30653064
CCoreWorkerProcess.Initialize(options)
30663065

30673066
self.cgname_to_eventloop_dict = None

python/ray/includes/libcoreworker.pxd

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
440440
int64_t worker_launch_time_ms
441441
int64_t worker_launched_time_ms
442442
c_string debug_source
443-
c_bool enable_resource_isolation
444443

445444
cdef cppclass CCoreWorkerProcess "ray::core::CoreWorkerProcess":
446445
@staticmethod

src/ray/common/cgroup2/BUILD.bazel

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,40 @@
11
load("//bazel:ray.bzl", "ray_cc_library")
22

3+
config_setting(
4+
name = "is_linux",
5+
constraint_values = ["@platforms//os:linux"],
6+
)
7+
8+
# Public targets.
9+
ray_cc_library(
10+
name = "cgroup_manager",
11+
srcs = select({
12+
":is_linux": ["cgroup_manager.cc"],
13+
"//conditions:default": ["noop_cgroup_manager.cc"],
14+
}),
15+
hdrs = ["cgroup_manager.h"],
16+
visibility = ["//visibility:public"],
17+
deps = [
18+
":cgroup_driver_interface",
19+
":cgroup_manager_interface",
20+
"//src/ray/common:status",
21+
"//src/ray/common:status_or",
22+
] + select({
23+
":is_linux": [
24+
":scoped_cgroup_operation",
25+
"//src/ray/util:logging",
26+
"@com_google_absl//absl/strings",
27+
],
28+
"//conditions:default": [],
29+
}),
30+
)
31+
332
ray_cc_library(
433
name = "cgroup_driver_interface",
534
hdrs = [
635
"cgroup_driver_interface.h",
736
],
8-
target_compatible_with = [
9-
"@platforms//os:linux",
10-
],
37+
visibility = ["//visibility:public"],
1138
deps = [
1239
"//src/ray/common:status",
1340
"//src/ray/common:status_or",
@@ -19,51 +46,42 @@ ray_cc_library(
1946
hdrs = [
2047
"cgroup_manager_interface.h",
2148
],
22-
target_compatible_with = [
23-
"@platforms//os:linux",
24-
],
49+
visibility = ["//visibility:public"],
2550
deps = [
2651
"//src/ray/common:status",
2752
"//src/ray/common:status_or",
2853
],
2954
)
3055

3156
ray_cc_library(
32-
name = "cgroup_manager",
33-
srcs = ["cgroup_manager.cc"],
57+
name = "sysfs_cgroup_driver",
58+
srcs = ["sysfs_cgroup_driver.cc"],
3459
hdrs = [
35-
"cgroup_manager.h",
36-
"scoped_cgroup_operation.h",
60+
"sysfs_cgroup_driver.h",
3761
],
3862
target_compatible_with = [
3963
"@platforms//os:linux",
4064
],
65+
visibility = ["//visibility:public"],
4166
deps = [
4267
":cgroup_driver_interface",
43-
":cgroup_manager_interface",
4468
"//src/ray/common:status",
4569
"//src/ray/common:status_or",
4670
"//src/ray/util:logging",
4771
"@com_google_absl//absl/strings",
4872
],
4973
)
5074

75+
# Private Targets.
5176
ray_cc_library(
52-
name = "sysfs_cgroup_driver",
53-
srcs = ["sysfs_cgroup_driver.cc"],
77+
name = "scoped_cgroup_operation",
5478
hdrs = [
55-
"sysfs_cgroup_driver.h",
79+
"scoped_cgroup_operation.h",
5680
],
5781
target_compatible_with = [
5882
"@platforms//os:linux",
5983
],
60-
deps = [
61-
":cgroup_driver_interface",
62-
"//src/ray/common:status",
63-
"//src/ray/common:status_or",
64-
"//src/ray/util:logging",
65-
"@com_google_absl//absl/strings",
66-
],
84+
visibility = [":__subpackages__"],
6785
)
6886

6987
ray_cc_library(
@@ -74,6 +92,7 @@ ray_cc_library(
7492
target_compatible_with = [
7593
"@platforms//os:linux",
7694
],
95+
visibility = [":__subpackages__"],
7796
deps = [
7897
":cgroup_driver_interface",
7998
"//src/ray/common:status",
@@ -87,6 +106,7 @@ ray_cc_library(
87106
target_compatible_with = [
88107
"@platforms//os:linux",
89108
],
109+
visibility = [":__subpackages__"],
90110
deps = [
91111
"//src/ray/common:id",
92112
"//src/ray/common:status",

src/ray/common/cgroup2/cgroup_manager.cc

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -113,48 +113,65 @@ StatusOr<std::unique_ptr<CgroupManager>> CgroupManager::Create(
113113
return cgroup_manager;
114114
}
115115

116-
// TODO(#54703): This is a placeholder for cleanup. This will call
117-
// CgroupDriver::DeleteCgroup.
118116
void CgroupManager::RegisterDeleteCgroup(const std::string &cgroup_path) {
119-
cleanup_operations_.emplace_back([cgroup = cgroup_path]() {
120-
RAY_LOG(INFO) << absl::StrFormat("Deleting all cgroup %s.", cgroup);
117+
cleanup_operations_.emplace_back([this, cgroup = cgroup_path]() {
118+
Status s = this->cgroup_driver_->DeleteCgroup(cgroup);
119+
if (!s.ok()) {
120+
RAY_LOG(WARNING) << absl::StrFormat(
121+
"Failed to delete cgroup %s with error %s.", cgroup, s.ToString());
122+
}
121123
});
122124
}
123125

124-
// TODO(#54703): This is a placeholder for cleanup. This will call
125-
// CgroupDriver::MoveAllProcesses.
126126
void CgroupManager::RegisterMoveAllProcesses(const std::string &from,
127127
const std::string &to) {
128-
cleanup_operations_.emplace_back([from_cgroup = from, to_cgroup = to]() {
129-
RAY_LOG(INFO) << absl::StrFormat(
130-
"Moved All Processes from %s to %s.", from_cgroup, to_cgroup);
128+
cleanup_operations_.emplace_back([this, from_cgroup = from, to_cgroup = to]() {
129+
Status s = this->cgroup_driver_->MoveAllProcesses(from_cgroup, to_cgroup);
130+
if (!s.ok()) {
131+
RAY_LOG(WARNING) << absl::StrFormat(
132+
"Failed to move all processes from %s to %s with error %s",
133+
from_cgroup,
134+
to_cgroup,
135+
s.ToString());
136+
}
131137
});
132138
}
133139

134-
// TODO(#54703): This is a placeholder for cleanup. This will call
135-
// CgroupDriver::AddConstraint(cgroup, constraint, default_value).
136140
template <typename T>
137141
void CgroupManager::RegisterRemoveConstraint(const std::string &cgroup,
138142
const Constraint<T> &constraint) {
139143
cleanup_operations_.emplace_back(
140-
[constrained_cgroup = cgroup, constraint_to_remove = constraint]() {
141-
RAY_LOG(INFO) << absl::StrFormat(
142-
"Setting constraint %s to default value %lld for cgroup %s",
143-
constraint_to_remove.name_,
144-
constraint_to_remove.default_value_,
145-
constrained_cgroup);
144+
[this, constrained_cgroup = cgroup, constraint_to_remove = constraint]() {
145+
std::string default_value = std::to_string(constraint_to_remove.default_value_);
146+
Status s = this->cgroup_driver_->AddConstraint(constrained_cgroup,
147+
constraint_to_remove.controller_,
148+
constraint_to_remove.name_,
149+
default_value);
150+
if (!s.ok()) {
151+
RAY_LOG(WARNING) << absl::StrFormat(
152+
"Failed to set constraint %s=%s to default value for cgroup %s with error "
153+
"%s.",
154+
constraint_to_remove.name_,
155+
default_value,
156+
constrained_cgroup,
157+
s.ToString());
158+
}
146159
});
147160
}
148161

149-
// TODO(#54703): This is a placeholder for cleanup. This will call
150-
// CgroupDriver::DisableController.
151-
void CgroupManager::RegisterDisableController(const std::string &cgroup,
162+
void CgroupManager::RegisterDisableController(const std::string &cgroup_path,
152163
const std::string &controller) {
153-
cleanup_operations_.emplace_back([cgroup_to_clean = cgroup,
154-
controller_to_disable = controller]() {
155-
RAY_LOG(INFO) << absl::StrFormat(
156-
"Disabling controller %s for cgroup %s.", controller_to_disable, cgroup_to_clean);
157-
});
164+
cleanup_operations_.emplace_back(
165+
[this, cgroup = cgroup_path, controller_to_disable = controller]() {
166+
Status s = this->cgroup_driver_->DisableController(cgroup, controller_to_disable);
167+
if (!s.ok()) {
168+
RAY_LOG(WARNING) << absl::StrFormat(
169+
"Failed to disable controller %s for cgroup %s with error %s",
170+
controller_to_disable,
171+
cgroup,
172+
s.ToString());
173+
}
174+
});
158175
}
159176

160177
Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
@@ -168,11 +185,11 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
168185
cpu_weight_constraint_.Max() - system_reserved_cpu_weight;
169186

170187
RAY_LOG(INFO) << absl::StrFormat(
171-
"Initializing CgroupManager at base cgroup path at %s. Ray's cgroup "
172-
"hierarchy will under the node cgroup %s. The %s controllers will be "
188+
"Initializing CgroupManager at base cgroup at '%s'. Ray's cgroup "
189+
"hierarchy will under the node cgroup at '%s'. The %s controllers will be "
173190
"enabled. "
174-
"System cgroup %s will have constraints [%s=%lld, %s=%lld]. "
175-
"Application cgroup %s will have constraints [%s=%lld].",
191+
"The system cgroup at '%s' will have constraints [%s=%lld, %s=%lld]. "
192+
"The application cgroup '%s' will have constraints [%s=%lld].",
176193
base_cgroup_path_,
177194
node_cgroup_path_,
178195
supported_controllers,

src/ray/common/cgroup2/cgroup_manager.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,10 @@ class CgroupManager : public CgroupManagerInterface {
111111
Status Initialize(const int64_t system_reserved_cpu_weight,
112112
const int64_t system_reserved_memory_bytes);
113113

114-
// TODO(#54703): This is a placeholder for cleanup. This will be implemented in the a
115-
// future PR.
116-
void RegisterDeleteCgroup(const std::string &cgroup_path);
114+
// The Register* methods register a callback that will execute in the destructor
115+
// in FILO order. All callbacks required the cgroup_driver_ to be available to
116+
// remove the cgroup hierarchy.
117+
void RegisterDeleteCgroup(const std::string &cgroup);
117118
void RegisterMoveAllProcesses(const std::string &from, const std::string &to);
118119
template <typename T>
119120
void RegisterRemoveConstraint(const std::string &cgroup,

0 commit comments

Comments
 (0)