File tree (Expand / Collapse): 1 file changed, +13 −4 lines changed
from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler

# Attention backends exercised by this test module. FlashAttention is always
# included; the remaining backends are only imported inside the platform guard
# below — presumably they are unavailable on ROCm, so importing them there
# would fail (NOTE(review): confirm against the backend modules).
BACKENDS_TO_TEST = [FlashAttentionBackend]

if not current_platform.is_rocm():
    # Deferred imports: only pulled in on non-ROCm platforms.
    from vllm.v1.attention.backends.flashinfer import FlashInferBackend

    BACKENDS_TO_TEST.append(FlashInferBackend)

    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend

    BACKENDS_TO_TEST.append(FlashAttnMLABackend)

# Parametrization grids for the transfer tests below.
NUM_GPU_BLOCKS = [64]
NUM_CPU_BLOCKS = [256]
GPU_BLOCK_SIZES = [16]
@@ -55,8 +64,8 @@ def test_transfer(
5564) -> None :
5665 current_platform .seed_everything (seed )
5766
58- # create per-layer GPU KV caches
59- attn_backends_list = [ FlashAttentionBackend , FlashInferBackend , FlashAttnMLABackend ]
67+ # create per-layer GPU KV caches based on available attn_backends
68+ attn_backends_list = BACKENDS_TO_TEST
6069
6170 gpu_caches = {}
6271 attn_backends = {}
You can’t perform that action at this time.
0 commit comments