[Bugfix] Explicitly set LoRA triton kernel device #13043

Status: Closed · wants to merge 3 commits

Changes from all commits
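No description accompanied this PR, so the following summary is inferred from the title and the diff. Triton launches kernels on the current CUDA device, not on the device of the tensors it is handed, so invoking the LoRA kernels with inputs on a non-default GPU (say, cuda:1 while the current device is cuda:0) can launch on the wrong device. The fix adds a small _set_cuda_device helper to vllm/lora/ops/triton_ops/utils.py and calls it with inputs.device immediately before every LoRA kernel launch. The same two-line change is applied to all five kernel wrappers below; a minimal sketch of the failure mode follows the first file's diff.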
vllm/lora/ops/triton_ops/bgmv_expand.py (2 additions, 1 deletion)

@@ -12,7 +12,7 @@

 from vllm.utils import direct_register_custom_op

-from .utils import get_lora_op_configs
+from .utils import _set_cuda_device, get_lora_op_configs


 @triton.jit
@@ -142,6 +142,7 @@ def _bgmv_expand(
         META["SPLIT_N"],
         batches,
     )
+    _set_cuda_device(inputs.device)
     _bgmv_expand_kernel[grid](
         inputs,
         lora_b_weights,
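To make the pattern concrete, here is a minimal sketch of the underlying issue and the fix. It is not from vLLM; the kernel and names are illustrative. Triton launches a kernel on the current CUDA device, so aligning that device with the input tensor's device before the launch is exactly what each wrapper in this PR now does.

import torch
import triton
import triton.language as tl

@triton.jit
def _copy_kernel(src_ptr, dst_ptr, n, BLOCK: tl.constexpr):
    # Each program copies one BLOCK-sized chunk of src into dst.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)

def copy_on(device: str) -> torch.Tensor:
    src = torch.arange(1024, device=device, dtype=torch.float32)
    dst = torch.empty_like(src)
    # Without this line, launching with tensors on e.g. cuda:1 while the
    # current device is cuda:0 can fail or target the wrong GPU.
    torch.cuda.set_device(src.device)
    _copy_kernel[(4,)](src, dst, src.numel(), BLOCK=256)
    return dst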
vllm/lora/ops/triton_ops/bgmv_expand_slice.py (2 additions, 1 deletion)

@@ -12,7 +12,7 @@

 from vllm.utils import direct_register_custom_op

-from .utils import get_lora_op_configs
+from .utils import _set_cuda_device, get_lora_op_configs


 @triton.jit
@@ -158,6 +158,7 @@ def _bgmv_expand_slice(
         META["SPLIT_N"],
         batches,
     )
+    _set_cuda_device(inputs.device)
     _bgmv_expand_slice_kernel[grid](
         inputs,
         lora_b_weights,
vllm/lora/ops/triton_ops/bgmv_shrink.py (2 additions, 1 deletion)

@@ -12,7 +12,7 @@

 from vllm.utils import direct_register_custom_op

-from .utils import get_lora_op_configs
+from .utils import _set_cuda_device, get_lora_op_configs


 @triton.jit
@@ -124,6 +124,7 @@ def _bgmv_shrink(
         META["SPLIT_K"],
         batches,
     )
+    _set_cuda_device(inputs.device)
     _bgmv_shrink_kernel[grid](
         inputs,
         lora_a_weights,
vllm/lora/ops/triton_ops/sgmv_expand.py (2 additions, 1 deletion)

@@ -14,7 +14,7 @@

 from vllm.utils import direct_register_custom_op

-from .utils import _get_lora_b_ptr
+from .utils import _get_lora_b_ptr, _set_cuda_device


 @triton.jit
@@ -218,6 +218,7 @@ def _sgmv_expand(
         batches,
         len(lora_b_weights),
     )
+    _set_cuda_device(inputs.device)
     _sgmv_expand_kernel[grid](
         inputs,
         lora_ptr_tensor,
vllm/lora/ops/triton_ops/sgmv_shrink.py (2 additions, 1 deletion)

@@ -14,7 +14,7 @@

 from vllm.utils import direct_register_custom_op

-from .utils import _get_lora_a_ptr
+from .utils import _get_lora_a_ptr, _set_cuda_device


 @triton.jit
@@ -184,6 +184,7 @@ def _sgmv_shrink(
         SPLIT_K * len(lora_a_weights),
         batches,
     )
+    _set_cuda_device(inputs.device)
     _sgmv_shrink_kernel[grid](
         inputs,
         lora_ptr_tensor,
vllm/lora/ops/triton_ops/utils.py (9 additions, 0 deletions)

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import functools
+from functools import lru_cache
 from typing import Dict, List, Tuple

 import torch
@@ -50,6 +51,14 @@ def get_lora_op_configs(op_type: str, batch: int,
     return config


+@lru_cache
+def _set_cuda_device(device: torch.device):
+    """
+    Sets the current CUDA device.
+    """
+    torch.cuda.set_device(device)
+
+
 _LORA_A_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {}
 _LORA_B_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {}
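The @lru_cache wrapper means torch.cuda.set_device is only actually invoked the first time each distinct device is seen; later calls with the same device hit the cache and are effectively free. A small self-contained demonstration of that behavior (not part of the PR; the traced function stands in for the real helper):

from functools import lru_cache

calls = []

@lru_cache
def _set_device_traced(device: str) -> None:
    # Stand-in for torch.cuda.set_device(device).
    calls.append(device)

_set_device_traced("cuda:0")
_set_device_traced("cuda:0")  # cache hit: the body does not run again
_set_device_traced("cuda:1")
assert calls == ["cuda:0", "cuda:1"]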