@@ -333,36 +333,64 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
333
333
" in CUDA target architectures, or CUDA not >= 12.0" )
334
334
endif ()
335
335
336
+
337
+ set (SCALED_MM_3X_ARCHS )
336
338
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
337
- # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
338
- cuda_archs_loose_intersection (SCALED_MM_3X_ARCHS "9.0a;10.0a;10.1a;12.0a " "${CUDA_ARCHS} " )
339
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS )
339
+ # CUDA 12.0 or later
340
+ cuda_archs_loose_intersection (SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS} " )
341
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS )
340
342
set (SRCS
341
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
343
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
342
344
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
343
345
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
344
346
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
345
347
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu" )
346
348
set_gencode_flags_for_srcs (
347
349
SRCS "${SRCS} "
348
- CUDA_ARCHS "${SCALED_MM_3X_ARCHS} " )
350
+ CUDA_ARCHS "${SCALED_MM_ARCHS} " )
349
351
list (APPEND VLLM_EXT_SRC "${SRCS} " )
350
- list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1" )
351
- message (STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
352
+ list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1" )
353
+ # Let scaled_mm_c2x know it doesn't need to build these arches
354
+ list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
355
+ message (STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS} " )
352
356
else ()
353
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS )
354
- message (STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
357
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS )
358
+ message (STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
355
359
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
356
360
"later if you intend on running FP8 quantized models on "
357
361
"Hopper." )
358
362
else ()
359
- message (STATUS "Not building scaled_mm_c3x as no compatible archs found "
363
+ message (STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
360
364
"in CUDA target architectures" )
361
365
endif ()
366
+ endif ()
362
367
363
- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
364
- # build any 3x kernels
365
- set (SCALED_MM_3X_ARCHS )
368
+ # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
369
+ # CUDA 12.8 or later
370
+ cuda_archs_loose_intersection (SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS} " )
371
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS )
372
+ set (SRCS
373
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
374
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
375
+ )
376
+ set_gencode_flags_for_srcs (
377
+ SRCS "${SRCS} "
378
+ CUDA_ARCHS "${SCALED_MM_ARCHS} " )
379
+ list (APPEND VLLM_EXT_SRC "${SRCS} " )
380
+ list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1" )
381
+ # Let scaled_mm_c2x know it doesn't need to build these arches
382
+ list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
383
+ message (STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS} " )
384
+ else ()
385
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS )
386
+ message (STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
387
+ "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
388
+ "later if you intend on running FP8 quantized models on "
389
+ "Blackwell." )
390
+ else ()
391
+ message (STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
392
+ "in CUDA target architectures" )
393
+ endif ()
366
394
endif ()
367
395
368
396
#
@@ -395,16 +423,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
395
423
396
424
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
397
425
# require CUDA 12.2 or later (and only work on Hopper and Blackwell).
398
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
426
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
399
427
set (SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" )
400
428
set_gencode_flags_for_srcs (
401
429
SRCS "${SRCS} "
402
- CUDA_ARCHS "${SCALED_MM_3X_ARCHS} " )
430
+ CUDA_ARCHS "${SCALED_MM_ARCHS} " )
403
431
list (APPEND VLLM_EXT_SRC "${SRCS} " )
404
432
list (APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1" )
405
- message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
433
+ message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS} " )
406
434
else ()
407
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
435
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
408
436
message (STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
409
437
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
410
438
"if you intend on running FP8 sparse quantized models on Hopper." )
@@ -432,22 +460,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
432
460
set (FP4_ARCHS )
433
461
endif ()
434
462
435
- # FP8 Blackwell Archs
436
- cuda_archs_loose_intersection (BLACKWELL_ARCHS "10.0;10.1;12.0" "${CUDA_ARCHS} " )
437
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND BLACKWELL_ARCHS )
438
- set (SRCS
439
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
440
- )
441
- set_gencode_flags_for_srcs (
442
- SRCS "${SRCS} "
443
- CUDA_ARCHS "${BLACKWELL_ARCHS} " )
444
- list (APPEND VLLM_EXT_SRC "${SRCS} " )
445
- message (STATUS "Building FP8 for archs: ${BLACKWELL_ARCHS} " )
446
- else ()
447
- # clear BLACKWELL_ARCHS
448
- set (BLACKWELL_ARCHS )
449
- endif ()
450
-
451
463
#
452
464
# Machete kernels
453
465
0 commit comments