Skip to content

Commit 39cf6ff

Browse files
atumanov authored and pcmoritz committed
raylet command line resource configuration plumbing (#1882)
* raylet command line resource configuration plumbing
* Small changes.
1 parent 85d3963 commit 39cf6ff

File tree

2 files changed

+70
-27
lines changed

2 files changed

+70
-27
lines changed

python/ray/services.py

Lines changed: 56 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -801,6 +801,49 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
801801
return webui_url
802802

803803

804+
def check_and_update_resources(resources):
805+
"""Sanity check a resource dictionary and add sensible defaults.
806+
807+
Args:
808+
resources: A dictionary mapping resource names to resource quantities.
809+
810+
Returns:
811+
A new resource dictionary.
812+
"""
813+
if resources is None:
814+
resources = {}
815+
resources = resources.copy()
816+
if "CPU" not in resources:
817+
# By default, use the number of hardware execution threads for the
818+
# number of cores.
819+
resources["CPU"] = psutil.cpu_count()
820+
821+
# See if CUDA_VISIBLE_DEVICES has already been set.
822+
gpu_ids = ray.utils.get_cuda_visible_devices()
823+
824+
# Check that the number of GPUs that the local scheduler wants doesn't
825+
# exceed the amount allowed by CUDA_VISIBLE_DEVICES.
826+
if ("GPU" in resources and gpu_ids is not None
827+
and resources["GPU"] > len(gpu_ids)):
828+
raise Exception("Attempting to start local scheduler with {} GPUs, "
829+
"but CUDA_VISIBLE_DEVICES contains {}.".format(
830+
resources["GPU"], gpu_ids))
831+
832+
if "GPU" not in resources:
833+
# Try to automatically detect the number of GPUs.
834+
resources["GPU"] = _autodetect_num_gpus()
835+
# Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
836+
if gpu_ids is not None:
837+
resources["GPU"] = min(resources["GPU"], len(gpu_ids))
838+
839+
# Check types.
840+
for _, resource_quantity in resources.items():
841+
assert (isinstance(resource_quantity, int)
842+
or isinstance(resource_quantity, float))
843+
844+
return resources
845+
846+
804847
def start_local_scheduler(redis_address,
805848
node_ip_address,
806849
plasma_store_name,
@@ -839,30 +882,7 @@ def start_local_scheduler(redis_address,
839882
Return:
840883
The name of the local scheduler socket.
841884
"""
842-
if resources is None:
843-
resources = {}
844-
if "CPU" not in resources:
845-
# By default, use the number of hardware execution threads for the
846-
# number of cores.
847-
resources["CPU"] = psutil.cpu_count()
848-
849-
# See if CUDA_VISIBLE_DEVICES has already been set.
850-
gpu_ids = ray.utils.get_cuda_visible_devices()
851-
852-
# Check that the number of GPUs that the local scheduler wants doesn't
853-
# exceed the amount allowed by CUDA_VISIBLE_DEVICES.
854-
if ("GPU" in resources and gpu_ids is not None
855-
and resources["GPU"] > len(gpu_ids)):
856-
raise Exception("Attempting to start local scheduler with {} GPUs, "
857-
"but CUDA_VISIBLE_DEVICES contains {}.".format(
858-
resources["GPU"], gpu_ids))
859-
860-
if "GPU" not in resources:
861-
# Try to automatically detect the number of GPUs.
862-
resources["GPU"] = _autodetect_num_gpus()
863-
# Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
864-
if gpu_ids is not None:
865-
resources["GPU"] = min(resources["GPU"], len(gpu_ids))
885+
resources = check_and_update_resources(resources)
866886

867887
print("Starting local scheduler with the following resources: {}."
868888
.format(resources))
@@ -889,6 +909,7 @@ def start_raylet(redis_address,
889909
node_ip_address,
890910
plasma_store_name,
891911
worker_path,
912+
resources=None,
892913
stdout_file=None,
893914
stderr_file=None,
894915
cleanup=True):
@@ -913,6 +934,15 @@ def start_raylet(redis_address,
913934
Returns:
914935
The raylet socket name.
915936
"""
937+
static_resources = check_and_update_resources(resources)
938+
939+
# Format the resource argument in a form like 'CPU,1.0,GPU,0,Custom,3'.
940+
resource_argument = ",".join([
941+
"{},{}".format(resource_name, resource_value)
942+
for resource_name, resource_value in zip(static_resources.keys(),
943+
static_resources.values())
944+
])
945+
916946
gcs_ip_address, gcs_port = redis_address.split(":")
917947
raylet_name = "/tmp/raylet{}".format(random_name())
918948

@@ -927,7 +957,7 @@ def start_raylet(redis_address,
927957

928958
command = [
929959
RAYLET_EXECUTABLE, raylet_name, plasma_store_name, node_ip_address,
930-
gcs_ip_address, gcs_port, start_worker_command
960+
gcs_ip_address, gcs_port, start_worker_command, resource_argument
931961
]
932962
pid = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
933963

@@ -1437,6 +1467,7 @@ def start_ray_processes(address_info=None,
14371467
node_ip_address,
14381468
object_store_addresses[i].name,
14391469
worker_path,
1470+
resources=resources[i],
14401471
stdout_file=None,
14411472
stderr_file=None,
14421473
cleanup=cleanup)

src/ray/raylet/main.cc

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,21 +5,33 @@
55

66
#ifndef RAYLET_TEST
77
int main(int argc, char *argv[]) {
8-
RAY_CHECK(argc == 7);
8+
RAY_CHECK(argc == 8);
99

1010
const std::string raylet_socket_name = std::string(argv[1]);
1111
const std::string store_socket_name = std::string(argv[2]);
1212
const std::string node_ip_address = std::string(argv[3]);
1313
const std::string redis_address = std::string(argv[4]);
1414
int redis_port = std::stoi(argv[5]);
1515
const std::string worker_command = std::string(argv[6]);
16+
const std::string static_resource_list = std::string(argv[7]);
1617

1718
// Configuration for the node manager.
1819
ray::raylet::NodeManagerConfig node_manager_config;
1920
std::unordered_map<std::string, double> static_resource_conf;
20-
static_resource_conf = {{"CPU", 1}, {"GPU", 1}};
21+
// Parse the resource list.
22+
std::istringstream resource_string(static_resource_list);
23+
std::string resource_name;
24+
std::string resource_quantity;
25+
26+
while (std::getline(resource_string, resource_name, ',')) {
27+
RAY_CHECK(std::getline(resource_string, resource_quantity, ','));
28+
// TODO(rkn): The line below could throw an exception. What should we do about this?
29+
static_resource_conf[resource_name] = std::stod(resource_quantity);
30+
}
2131
node_manager_config.resource_config =
2232
ray::raylet::ResourceSet(std::move(static_resource_conf));
33+
RAY_LOG(INFO) << "Starting raylet with static resource configuration: "
34+
<< node_manager_config.resource_config.ToString();
2335
node_manager_config.num_initial_workers = 0;
2436
// Use a default worker that can execute empty tasks with dependencies.
2537

0 commit comments

Comments
 (0)