Skip to content

Commit a581001

Browse files
israbbanielliot-barn
authored andcommitted
[core] (cgroups 17/n) Removing controller check when adding a constraint (#57731)
In #57269, I introduced a bug: * the `cpu` controller is not enabled on the system and user cgroups because it only needs to be enabled on the parent. * however, CgroupDriver::AddConstraint checked that for every constraint, the matching controller was enabled. This was not caught in CI because the python integration tests were not running. For the time being, I've removed the manual tag and excluded `cgroup` from all other python tests. We need to fix this properly because tests shouldn't exclude other tests but rather include the right targets. --------- Signed-off-by: irabbani <israbbani@gmail.com> Signed-off-by: elliot-barn <elliot.barnwell@anyscale.com>
1 parent 739cca1 commit a581001

File tree

9 files changed

+19
-53
lines changed

9 files changed

+19
-53
lines changed

.buildkite/core.rayci.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ steps:
6262
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/_common/tests/... //python/ray/dag/... //python/ray/autoscaler/v2/... core
6363
--install-mask all-ray-libraries
6464
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
65-
--except-tags custom_setup
65+
--except-tags custom_setup,cgroup
6666
--install-mask all-ray-libraries
6767

6868
- label: ":ray: core: python {{matrix.python}} tests ({{matrix.worker_id}})"
@@ -76,7 +76,7 @@ steps:
7676
--install-mask all-ray-libraries
7777
--workers 4 --worker-id "{{matrix.worker_id}}" --parallelism-per-worker 3
7878
--python-version {{matrix.python}}
79-
--except-tags custom_setup
79+
--except-tags custom_setup,cgroup
8080
depends_on: corebuild-multipy
8181
matrix:
8282
setup:
@@ -105,7 +105,7 @@ steps:
105105
--install-mask all-ray-libraries
106106
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
107107
--test-env=TEST_EXTERNAL_REDIS=1
108-
--except-tags custom_setup
108+
--except-tags custom_setup,cgroup
109109

110110
- label: ":ray: core: memory pressure tests"
111111
tags:
@@ -261,7 +261,7 @@ steps:
261261
--test-env=RAY_MINIMAL=1
262262
--test-env=EXPECTED_PYTHON_VERSION={{matrix}}
263263
--only-tags minimal
264-
--except-tags basic_test,manual
264+
--except-tags basic_test,manual,cgroup
265265
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dashboard/... core
266266
--parallelism-per-worker 3
267267
--python-version {{matrix}}
@@ -366,7 +366,7 @@ steps:
366366
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/... //doc/... core
367367
--install-mask all-ray-libraries
368368
--run-flaky-tests
369-
--except-tags multi_gpu
369+
--except-tags multi_gpu,cgroup
370370
depends_on:
371371
- corebuild
372372

python/ray/tests/resource_isolation/BUILD.bazel

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ py_test(
1616
tags = [
1717
"cgroup",
1818
"exclusive",
19-
"manual",
2019
"team:core",
2120
],
2221
target_compatible_with = [
@@ -33,8 +32,8 @@ py_test(
3332
size = "medium",
3433
srcs = ["test_resource_isolation_config.py"],
3534
tags = [
35+
"cgroup",
3636
"exclusive",
37-
"manual",
3837
"team:core",
3938
],
4039
deps = [

src/ray/common/cgroup2/cgroup_driver_interface.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@ class CgroupDriverInterface {
172172
supported or the value not correct.
173173
*/
174174
virtual Status AddConstraint(const std::string &cgroup,
175-
const std::string &controller,
176175
const std::string &constraint,
177176
const std::string &value) = 0;
178177
/**

src/ray/common/cgroup2/cgroup_manager.cc

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,8 @@ void CgroupManager::RegisterRemoveConstraint(const std::string &cgroup,
173173
cleanup_operations_.emplace_back(
174174
[this, constrained_cgroup = cgroup, constraint_to_remove = constraint]() {
175175
std::string default_value = std::to_string(constraint_to_remove.default_value_);
176-
Status s = this->cgroup_driver_->AddConstraint(constrained_cgroup,
177-
constraint_to_remove.controller_,
178-
constraint_to_remove.name_,
179-
default_value);
176+
Status s = this->cgroup_driver_->AddConstraint(
177+
constrained_cgroup, constraint_to_remove.name_, default_value);
180178
if (!s.ok()) {
181179
RAY_LOG(WARNING) << absl::StrFormat(
182180
"Failed to set constraint %s=%s to default value for cgroup %s with error "
@@ -292,22 +290,18 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
292290

293291
RAY_RETURN_NOT_OK(
294292
cgroup_driver_->AddConstraint(system_cgroup_,
295-
cpu_weight_constraint_.controller_,
296293
cpu_weight_constraint_.name_,
297294
std::to_string(system_reserved_cpu_weight)));
298295
RegisterRemoveConstraint(system_cgroup_, cpu_weight_constraint_);
299296

300297
RAY_RETURN_NOT_OK(
301298
cgroup_driver_->AddConstraint(system_cgroup_,
302-
memory_min_constraint_.controller_,
303299
memory_min_constraint_.name_,
304300
std::to_string(system_reserved_memory_bytes)));
305301
RegisterRemoveConstraint(system_cgroup_, memory_min_constraint_);
306302

307-
RAY_RETURN_NOT_OK(cgroup_driver_->AddConstraint(user_cgroup_,
308-
cpu_weight_constraint_.controller_,
309-
cpu_weight_constraint_.name_,
310-
std::to_string(user_cpu_weight)));
303+
RAY_RETURN_NOT_OK(cgroup_driver_->AddConstraint(
304+
user_cgroup_, cpu_weight_constraint_.name_, std::to_string(user_cpu_weight)));
311305
RegisterRemoveConstraint(user_cgroup_, cpu_weight_constraint_);
312306

313307
return Status::OK();

src/ray/common/cgroup2/fake_cgroup_driver.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@ class FakeCgroupDriver : public CgroupDriverInterface {
194194
}
195195

196196
Status AddConstraint(const std::string &cgroup,
197-
const std::string &controller,
198197
const std::string &constraint,
199198
const std::string &value) override {
200199
if (!add_constraint_s_.ok()) {

src/ray/common/cgroup2/integration_tests/sysfs_cgroup_driver_integration_test.cc

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintFailsIfCgroupDoesn
574574
std::string non_existent_path =
575575
test_cgroup_path_ + std::filesystem::path::preferred_separator + "nope";
576576
SysFsCgroupDriver driver;
577-
Status s = driver.AddConstraint(non_existent_path, "memory", "memory.min", "1");
577+
Status s = driver.AddConstraint(non_existent_path, "memory.min", "1");
578578
ASSERT_TRUE(s.IsNotFound()) << s.ToString();
579579
}
580580

@@ -584,7 +584,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest,
584584
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
585585
auto cgroup = std::move(cgroup_or_status.value());
586586
SysFsCgroupDriver driver;
587-
Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1");
587+
Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1");
588588
ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString();
589589
}
590590

@@ -595,21 +595,10 @@ TEST_F(SysFsCgroupDriverIntegrationTest,
595595
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
596596
auto cgroup = std::move(cgroup_or_status.value());
597597
SysFsCgroupDriver driver;
598-
Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1");
598+
Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1");
599599
ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString();
600600
}
601601

602-
TEST_F(SysFsCgroupDriverIntegrationTest,
603-
AddResourceConstraintFailsIfControllerNotEnabled) {
604-
auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU);
605-
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
606-
auto cgroup = std::move(cgroup_or_status.value());
607-
SysFsCgroupDriver driver;
608-
// Memory controller is not enabled.
609-
Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1");
610-
ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
611-
}
612-
613602
TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) {
614603
auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU);
615604
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
@@ -619,7 +608,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) {
619608
Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu");
620609
ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString();
621610
// cpu.weight must be between [1,10000]
622-
Status s = driver.AddConstraint(cgroup->GetPath(), "cpu", "cpu.weight", "500");
611+
Status s = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "500");
623612
ASSERT_TRUE(s.ok()) << s.ToString();
624613
}
625614

src/ray/common/cgroup2/sysfs_cgroup_driver.cc

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -331,24 +331,12 @@ Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path,
331331
return Status::OK();
332332
}
333333

334+
// What's the right thing here? If the controller is specified?
335+
// The correct API would be specify where the controller should be enabled.
334336
Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup_path,
335-
const std::string &controller,
336337
const std::string &constraint,
337338
const std::string &constraint_value) {
338339
RAY_RETURN_NOT_OK(CheckCgroup(cgroup_path));
339-
// Check if the required controller for the constraint is enabled.
340-
StatusOr<std::unordered_set<std::string>> available_controllers_s =
341-
GetEnabledControllers(cgroup_path);
342-
RAY_RETURN_NOT_OK(available_controllers_s.status());
343-
const auto &controllers = available_controllers_s.value();
344-
if (controllers.find(controller) == controllers.end()) {
345-
return Status::InvalidArgument(absl::StrFormat(
346-
"Failed to apply %s to cgroup %s. To use %s, enable the %s controller.",
347-
constraint,
348-
cgroup_path,
349-
constraint,
350-
controller));
351-
}
352340

353341
// Try to apply the constraint and propagate the appropriate failure error.
354342
std::string file_path =

src/ray/common/cgroup2/sysfs_cgroup_driver.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,19 +236,17 @@ class SysFsCgroupDriver : public CgroupDriverInterface {
236236
Adds a constraint to the respective cgroup file.
237237
238238
@param cgroup_path absolute path of the cgroup.
239-
@param controller the name of the controller
240239
@param constraint the name of the cgroup file to add the constraint to e.g. cpu.weight
241240
@param constraint_value
242241
243242
@return Status::OK if no errors are encounted.
244243
@return Status::NotFound if the cgroup does not exist.
245244
@return Status::PermissionDenied if current user doesn't have read, write, and execute
246245
permissions.
247-
@return Status::InvalidArgument if the cgroup is not using cgroupv2, controller is not
248-
enabled, or cannot write to the constraint file.
246+
@return Status::InvalidArgument if the cgroup is not using cgroupv2, or cannot write
247+
to the constraint file.
249248
*/
250249
Status AddConstraint(const std::string &cgroup,
251-
const std::string &controller,
252250
const std::string &constraint,
253251
const std::string &constraint_value) override;
254252

src/ray/common/cgroup2/tests/sysfs_cgroup_driver_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ TEST(SysFsCgroupDriver, AddConstraintFailsIfNotCgroupv2Path) {
142142
ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString();
143143
std::unique_ptr<TempDirectory> temp_dir = std::move(temp_dir_or_status.value());
144144
SysFsCgroupDriver driver;
145-
Status s = driver.AddConstraint(temp_dir->GetPath(), "memory", "memory.min", "1");
145+
Status s = driver.AddConstraint(temp_dir->GetPath(), "memory.min", "1");
146146
ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
147147
}
148148

0 commit comments

Comments
 (0)