-
-
Notifications
You must be signed in to change notification settings - Fork 8.4k
[core][distributed] exact ray placement control #12732
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
63c54c4
add env vars
youkaichao 37b44cf
add examples
youkaichao 84bccc8
fix ray
youkaichao 97826d9
fix example
youkaichao 9820803
print more
youkaichao 49e72d9
add to tests
youkaichao 9a3512f
add more logging
youkaichao ac07519
add more logging
youkaichao faa7dda
add more logging
youkaichao bf85042
add more logging
youkaichao 31ab75c
update tests
youkaichao 86fa368
add comments
youkaichao bb33f83
add decorator
youkaichao 517c162
add comments
youkaichao eab9304
add comments
youkaichao 89a5fd0
Merge branch 'main' into bundle
youkaichao 1cec2f8
rename
youkaichao 98abef3
add asserts
youkaichao 8957091
add asserts
youkaichao 5cf14b7
add comments
youkaichao 91bb146
unify bundle_indices
youkaichao File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
""" | ||
A simple demonstration of how to control | ||
the placement of vLLM workers with Ray. | ||
The key is to set VLLM_RAY_PER_WORKER_GPUS and | ||
VLLM_RAY_BUNDLE_INDICES properly. | ||
""" | ||
import os | ||
|
||
import ray | ||
from ray.util.placement_group import placement_group | ||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy | ||
|
||
from vllm import LLM | ||
from vllm.worker.worker import Worker | ||
|
||
|
||
class MyWorker(Worker):
    """Worker subclass exposing a helper to report its device UUID."""

    def report_device_id(self) -> str:
        """Return the UUID the current platform reports for this worker.

        The lookup uses ``self.device.index`` as the device index.
        """
        from vllm.platforms import current_platform

        device_index = self.device.index
        return current_platform.get_device_uuid(device_index)
|
||
|
||
class MyLLM(LLM):
    """``LLM`` subclass that configures Ray placement before construction.

    Before delegating to ``LLM.__init__`` it sets the
    ``VLLM_RAY_PER_WORKER_GPUS`` and ``VLLM_RAY_BUNDLE_INDICES`` environment
    variables in the current process.
    """

    def __init__(self, *args, bundle_indices: list, **kwargs):
        """Set the placement environment variables, then build the ``LLM``.

        Args:
            *args: Positional arguments forwarded unchanged to ``LLM``.
            bundle_indices: Placement-group bundle indices for this
                instance; joined with commas into ``VLLM_RAY_BUNDLE_INDICES``.
            **kwargs: Keyword arguments forwarded unchanged to ``LLM``.
        """
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
        # pop() with a default rather than `del`, so construction does not
        # raise KeyError when the variable is absent in this process.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # every worker will use 0.4 GPU, so that we can schedule
        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
            map(str, bundle_indices))
        print(f"creating LLM with bundle_indices={bundle_indices}")
        super().__init__(*args, **kwargs)
|
||
|
||
class RayTrainingActor:
    """Minimal actor whose only job is to report its device UUID."""

    def report_device_id(self) -> str:
        """Return the UUID of the first visible device."""
        from vllm.platforms import current_platform

        # get_device_uuid takes an index into the visible devices.
        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs,
        # so this actor always asks for index 0.
        first_visible_index = 0
        return current_platform.get_device_uuid(first_visible_index)
|
||
|
||
# ray manages 4 GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()

# we want to co-locate vLLM instance and the training actor
# on the same set of GPUs.
# the placement plan is as follows:
# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)

# one bundle per GPU; block until the placement group is ready.
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")

training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []

# one training actor per bundle, each requesting 0.4 GPU.
for bundle_index in [0, 1, 2, 3]:
    training_actor = ray.remote(
        num_cpus=0,
        num_gpus=0.4,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
            placement_group_bundle_index=bundle_index,
        ),
    )(RayTrainingActor).remote()
    training_actors.append(training_actor)
    device_id = ray.get(training_actor.report_device_id.remote())
    print(f"training actor {bundle_index} is on {device_id}")
    training_actor_device_ids.append(device_id)

for bundle_indices in [[0, 1], [2, 3]]:
    # IMPORTANT: when creating vLLM instances, we need to
    # make sure there are no GPU activities on the target GPUs,
    # otherwise, they will interfere with the vLLM memory profiling,
    # and cause unexpected behaviors.
    llm = ray.remote(
        num_cpus=0,
        num_gpus=0,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ),
    )(MyLLM).remote(
        model="facebook/opt-125m",
        enforce_eager=True,
        worker_cls=MyWorker,
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        gpu_memory_utilization=0.4,
        bundle_indices=bundle_indices,
    )
    inference_engines.append(llm)
    # don't call any method on the inference engine here,
    # otherwise it will block until the vLLM instance is created.

# only now query the engines; each call blocks until that engine is up.
for i, llm in enumerate(inference_engines):
    inference_engine_device_ids.append(
        ray.get(llm.collective_rpc.remote("report_device_id", args=())))
    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")

# check the placement
# the first two training actors should be
# on the same GPUs as the first inference engine
assert training_actor_device_ids[:2] == inference_engine_device_ids[0], (
    f"training actors 0-1 are on {training_actor_device_ids[:2]}, but "
    f"inference engine 0 is on {inference_engine_device_ids[0]}")
# the last two training actors should be
# on the same GPUs as the second inference engine
assert training_actor_device_ids[2:] == inference_engine_device_ids[1], (
    f"training actors 2-3 are on {training_actor_device_ids[2:]}, but "
    f"inference engine 1 is on {inference_engine_device_ids[1]}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.