
Commit 311d80a

DarkLight1337 authored and Alvant committed
[CI/Build] Avoid CUDA initialization (vllm-project#8534)
Signed-off-by: Alvant <alvasian@yandex.ru>
1 parent 21f7556 commit 311d80a

55 files changed: +256 -256 lines changed. (Large commits have some content hidden by default; only a subset of the changed files appears below.)
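Every change in this commit follows the same pattern: ad-hoc seeding calls (random.seed, torch.random.manual_seed, torch.cuda.manual_seed / manual_seed_all) are replaced by a single seed_everything helper imported from vllm.utils, so that seeding a benchmark or test no longer touches the CUDA runtime on machines without a GPU. The helper's definition is not part of the hunks shown here; the following is only a sketch of what such a helper plausibly looks like, with the CUDA call guarded by torch.cuda.is_available():

# Sketch only -- not the exact vllm.utils implementation shown in this commit.
import random
from typing import Optional

import numpy as np
import torch


def seed_everything(seed: Optional[int] = None) -> None:
    """Seed the Python, NumPy and PyTorch RNGs without forcing CUDA init."""
    if seed is None:
        return
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Only touch the CUDA RNG when a GPU is actually present.
        torch.cuda.manual_seed_all(seed)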

benchmarks/kernels/benchmark_layernorm.py (+3 -6)

@@ -1,10 +1,10 @@
-import random
 import time

 import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


 @torch.inference_mode()
@@ -16,10 +16,7 @@ def main(num_tokens: int,
          do_profile: bool = False,
          num_warmup_iters: int = 5,
          num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device("cuda")

     layer = RMSNorm(hidden_size).to(dtype=dtype)

benchmarks/kernels/benchmark_moe.py (+3 -3)

@@ -10,7 +10,7 @@
 from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 class BenchmarkConfig(TypedDict):
@@ -166,7 +166,7 @@ class BenchmarkWorker:

     def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
+        seed_everything(seed)
         self.seed = seed

     def benchmark(
@@ -180,7 +180,7 @@ def benchmark(
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
     ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
+        seed_everything(self.seed)
         dtype_str = get_config_dtype_str(dtype,
                                          use_int8_w8a16=use_int8_w8a16,
                                          use_fp8_w8a8=use_fp8_w8a8)

benchmarks/kernels/benchmark_paged_attention.py (+2 -5)

@@ -6,7 +6,7 @@

 from vllm import _custom_ops as ops
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random)
+                        create_kv_caches_with_random, seed_everything)

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -28,10 +28,7 @@ def main(
     device: str = "cuda",
     kv_cache_dtype: Optional[str] = None,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)

     scale = float(1.0 / (head_size**0.5))
     query = torch.empty(num_seqs,

benchmarks/kernels/benchmark_quant.py (+3 -6)

@@ -1,10 +1,10 @@
-import random
 import time

 import torch

 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


 @torch.inference_mode()
@@ -17,10 +17,7 @@ def main(num_tokens: int,
          do_profile: bool = False,
          num_warmup_iters: int = 5,
          num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device("cuda")

     x = torch.randn(num_tokens, hidden_size, dtype=dtype)

benchmarks/kernels/benchmark_rope.py (+2 -4)

@@ -6,7 +6,7 @@

 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                           get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 def benchmark_rope_kernels_multi_lora(
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
     max_position: int = 8192,
     base: int = 10000,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size

tests/kernels/test_activation.py (+3 -6)

@@ -7,6 +7,7 @@
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
                                                     NewGELU, QuickGELU,
                                                     SiluAndMul)
+from vllm.utils import seed_everything

 from .allclose_default import get_default_atol, get_default_rtol

@@ -34,9 +35,7 @@ def test_act_and_mul(
     seed: int,
     device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
     if activation == "silu":
@@ -77,9 +76,7 @@ def test_activation(
     seed: int,
     device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, d, dtype=dtype)
     layer = activation[0]()

tests/kernels/test_attention.py (+5 -13)

@@ -6,7 +6,7 @@

 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything

 from .allclose_default import get_default_atol, get_default_rtol

@@ -139,10 +139,8 @@ def test_paged_attention(
 ) -> None:
     if kv_cache_dtype == "fp8" and head_size % 16:
         pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+
+    seed_everything(seed)
     torch.set_default_device(device)
     scale = float(1.0 / (head_size**0.5))
     num_query_heads, num_kv_heads = num_heads
@@ -354,10 +352,7 @@ def test_paged_attention_rocm(
     seed: int,
     device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     scale = float(1.0 / (head_size**0.5))
     num_query_heads, num_kv_heads = num_heads
@@ -506,10 +501,7 @@ def test_multi_query_kv_attention(
     seed: int,
     device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
     # As the xformers library is already tested with its own tests, we can use

tests/kernels/test_attention_selector.py (+1 -1)

@@ -45,7 +45,7 @@ def test_flash_attn(monkeypatch):
     override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)

     # Unsupported CUDA arch
-    with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
+    with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
         backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
         assert backend.name != STR_FLASH_ATTN_VAL

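The change above matters because torch.cuda.get_device_capability returns a tuple such as (7, 5), not a list, so mocking it with the real return type keeps any tuple-based capability comparison valid; Python refuses to order a list against a tuple. A small illustration (the check below is hypothetical, not the exact logic inside which_attn_to_use):

# Hypothetical capability check, used only to illustrate why the mocked
# return value should be a tuple, matching torch's real return type.
from unittest.mock import patch

import torch


def supports_flash_attn() -> bool:
    # torch.cuda.get_device_capability() really returns a tuple, e.g. (7, 5).
    return torch.cuda.get_device_capability() >= (8, 0)


with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
    assert not supports_flash_attn()  # tuple vs. tuple: comparison is valid

with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
    try:
        supports_flash_attn()
    except TypeError:
        pass  # list vs. tuple ordering is not defined in Python 3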
tests/kernels/test_awq_triton.py (+3 -2)

@@ -7,6 +7,7 @@

 from vllm.model_executor.layers.quantization.awq_triton import (
     AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
+from vllm.utils import seed_everything

 device = "cuda"

@@ -79,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
     zeros_cols = qweight_cols
     zeros_dtype = torch.int32

-    torch.manual_seed(0)
+    seed_everything(0)

     qweight = torch.randint(0,
                             torch.iinfo(torch.int32).max,
@@ -133,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
     qzeros_rows = scales_rows
     qzeros_cols = qweight_cols

-    torch.manual_seed(0)
+    seed_everything(0)

     input = torch.rand((input_rows, input_cols),
                        dtype=input_dtype,

tests/kernels/test_blocksparse_attention.py (+3 -9)

@@ -7,7 +7,7 @@
 from vllm import _custom_ops as ops
 from vllm.attention.ops.blocksparse_attention.interface import (
     LocalStridedBlockSparseAttn)
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything

 from .allclose_default import get_default_atol, get_default_rtol

@@ -172,10 +172,7 @@ def test_paged_attention(
     blocksparse_block_size: int,
     blocksparse_head_sliding_step: int,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     scale = float(1.0 / (head_size**0.5))
     num_query_heads, num_kv_heads = num_heads
@@ -386,10 +383,7 @@ def test_varlen_blocksparse_attention_prefill(
     seed: int,
     device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
     # As the xformers library is already tested with its own tests, we can use

tests/kernels/test_cache.py (+7 -18)

@@ -6,6 +6,7 @@

 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
+from vllm.utils import seed_everything

 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -55,10 +56,7 @@ def test_copy_blocks(
 ) -> None:
     if kv_cache_dtype == "fp8" and head_size % 16:
         pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     # Generate random block mappings where each source block is mapped to two
     # destination blocks.
@@ -134,10 +132,7 @@ def test_reshape_and_cache(
 ) -> None:
     if kv_cache_dtype == "fp8" and head_size % 16:
         pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
@@ -229,9 +224,7 @@ def test_reshape_and_cache_flash(
     device: str,
     kv_cache_dtype: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)

     # Create a random slot mapping.
@@ -345,10 +338,8 @@ def test_swap_blocks(
         pytest.skip()
     if kv_cache_dtype == "fp8" and head_size % 16:
         pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+
+    seed_everything(seed)

     src_device = device if direction[0] == "cuda" else 'cpu'
     dst_device = device if direction[1] == "cuda" else 'cpu'
@@ -417,9 +408,7 @@ def test_fp8_e4m3_conversion(
     seed: int,
     device: str,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    seed_everything(seed)

     low = -224.0
     high = 224.0

tests/kernels/test_causal_conv1d.py (+3 -2)

@@ -7,6 +7,7 @@

 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_fn, causal_conv1d_update)
+from vllm.utils import seed_everything


 def causal_conv1d_ref(
@@ -104,7 +105,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
     if itype == torch.bfloat16:
         rtol, atol = 1e-2, 5e-2
     # set seed
-    torch.random.manual_seed(0)
+    seed_everything(0)
     if not channel_last:
         x = torch.randn(batch,
                         4096 + dim + 64,
@@ -175,7 +176,7 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
     if itype == torch.bfloat16:
         rtol, atol = 1e-2, 5e-2
     # set seed
-    torch.random.manual_seed(0)
+    seed_everything(0)
     batch = 2
     x = torch.randn(batch, dim, device=device, dtype=itype)
     conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)
