
Commit 47a230c

LucasWilkinson authored and Alvant committed
[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (vllm-project#8845)
Signed-off-by: Alvant <alvasian@yandex.ru>
1 parent: 5b8f494 · commit: 47a230c

22 files changed: +828 -370 lines

CMakeLists.txt (+169 -55)
@@ -143,6 +143,19 @@ else()
   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()
 
+#
+# For CUDA we want to be able to control which architectures we compile for on
+# a per-file basis in order to cut down on compile time. So here we extract
+# the set of architectures we want to compile for and remove them from the
+# CMAKE_CUDA_FLAGS so that they are not applied globally.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+endif()
+
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # the supported versions for the current language.
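
The added block above leans on two helpers, clear_cuda_arches and extract_unique_cuda_archs_ascending, which live elsewhere in the repository's CMake utilities and are not part of this diff. As a rough illustration of what the extraction step amounts to (the function name, regexes, and exact behavior below are assumptions, not the repo's implementation):

# Sketch only: approximates turning captured -gencode flags into an ascending,
# de-duplicated "X.Y" architecture list. Suffixes such as the "a" in
# compute_90a are ignored here for simplicity.
function(extract_unique_cuda_archs_ascending_sketch OUT_ARCHS ARCH_FLAGS)
  string(REGEX MATCHALL "compute_[0-9]+" _computes "${ARCH_FLAGS}")
  set(_archs)
  foreach(_c ${_computes})
    # "compute_90" -> "9.0", "compute_100" -> "10.0"
    string(REGEX REPLACE "compute_([0-9]+)([0-9])" "\\1.\\2" _a "${_c}")
    list(APPEND _archs "${_a}")
  endforeach()
  list(REMOVE_DUPLICATES _archs)
  list(SORT _archs COMPARE NATURAL)
  set(${OUT_ARCHS} "${_archs}" PARENT_SCOPE)
endfunction()

clear_cuda_arches presumably strips the same -gencode flags out of CMAKE_CUDA_FLAGS so they stop applying to every .cu file globally, which is what lets the per-file flags below take over.
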
@@ -223,78 +236,162 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/mamba/causal_conv1d/causal_conv1d.cu"
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
-    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
-    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
-    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
     "csrc/quantization/gguf/gguf_kernel.cu"
-    "csrc/quantization/fp8/fp8_marlin.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${VLLM_EXT_SRC}"
+    CUDA_ARCHS "${CUDA_ARCHS}")
+
+  # Only build Marlin kernels if we are building for at least some compatible archs.
+  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
+  # are not supported by Machete yet.
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
+  if (MARLIN_ARCHS)
+    set(MARLIN_SRCS
+      "csrc/quantization/fp8/fp8_marlin.cu"
+      "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+      "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
+      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_SRCS}"
+      CUDA_ARCHS "${MARLIN_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+  else()
+    message(STATUS "Not building Marlin kernels as no compatible archs found "
+                   "in CUDA target architectures")
+  endif()
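
cuda_archs_loose_intersection is likewise defined outside this diff. From its call sites it appears to keep only the architectures that the kernel group supports and that also appear in the CUDA_ARCHS target list, matching "9.0a"-style entries loosely against their "9.0" base. A hypothetical sketch of that behavior, for illustration only:

# Hypothetical sketch of the "loose" intersection used above; the real helper
# lives elsewhere in the repo and may differ (for instance in how "+PTX" or
# "a"-suffixed entries are matched).
function(cuda_archs_loose_intersection_sketch OUT_VAR SUPPORTED_ARCHS TGT_ARCHS)
  set(_result)
  foreach(_arch ${SUPPORTED_ARCHS})
    # Compare on the numeric part so "9.0a" also matches a plain "9.0" target.
    string(REPLACE "a" "" _base "${_arch}")
    if("${_arch}" IN_LIST TGT_ARCHS OR "${_base}" IN_LIST TGT_ARCHS)
      list(APPEND _result "${_arch}")
    endif()
  endforeach()
  list(REMOVE_DUPLICATES _result)
  set(${OUT_VAR} "${_result}" PARENT_SCOPE)
endfunction()

Under this sketch, a CUDA_ARCHS of "8.0;9.0" would reduce the Marlin call above to "8.0;9.0", while the Hopper-only groups further down would reduce to "9.0;9.0a" or an empty list.
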
+
+  #
+  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
+    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+  else()
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+    # build any 3x kernels
+    set(SCALED_MM_3X_ARCHS)
+
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                     "later if you intend on running FP8 quantized models on "
+                     "Hopper.")
+    else()
+      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
 
   #
-  # The CUTLASS kernels for Hopper require sm90a to be enabled.
-  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
-  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
-    set_source_files_properties(
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
-      PROPERTIES
-      COMPILE_FLAGS
-      "-gencode arch=compute_90a,code=sm_90a")
+  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
+  # kernels for the remaining archs that are not already built for 3x.
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+    "7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
+  # subtract out the archs that are already built for 3x
+  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
+  if (SCALED_MM_2X_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
+    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
+  else()
+    if (SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
+                     " for and covered by scaled_mm_c3x")
+    else()
+      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
   endif()
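
The removed hunk above shows the old mechanism: hard-coding "-gencode arch=compute_90a,code=sm_90a" onto a single file with set_source_files_properties. set_gencode_flags_for_srcs (defined elsewhere in the repo) presumably generalizes that idea, deriving the per-file flags from an architecture list instead of hard-coding them. A minimal sketch under that assumption:

# Sketch only: assumes the helper amounts to per-source compile options, the
# way the removed block hard-coded them for scaled_mm_c3x.cu.
function(set_gencode_flags_for_srcs_sketch)
  cmake_parse_arguments(ARG "" "" "SRCS;CUDA_ARCHS" ${ARGN})
  set(_gencode_flags)
  foreach(_arch ${ARG_CUDA_ARCHS})
    string(REPLACE "." "" _sm "${_arch}")   # "9.0a" -> "90a", "8.6" -> "86"
    list(APPEND _gencode_flags
         "-gencode" "arch=compute_${_sm},code=sm_${_sm}")
  endforeach()
  # Attach the flags to just these sources instead of CMAKE_CUDA_FLAGS.
  set_property(SOURCE ${ARG_SRCS} APPEND PROPERTY
               COMPILE_OPTIONS ${_gencode_flags})
endfunction()

Per-file gencode flags are what let a 9.0a-only kernel such as scaled_mm_c3x.cu stop forcing sm_90a codegen onto every other source, which is where the wheel-size and compile-time savings in the commit title come from.
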
 
 
   #
   # Machete kernels
 
   # The machete kernels only work on hopper and require CUDA 12.0 or later.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+  # Only build Machete kernels if we are building for something compatible with sm90a
+  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
     # Generate sources:
-    execute_process(
-      COMMAND ${CMAKE_COMMAND} -E env
-      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
-      ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
-      RESULT_VARIABLE machete_generation_result
-      OUTPUT_VARIABLE machete_generation_output
-      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-    )
-
-    if (NOT machete_generation_result EQUAL 0)
-      message(FATAL_ERROR "Machete generation failed."
-              " Result: \"${machete_generation_result}\""
-              "\nCheck the log for details: "
-              "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
+    set(MACHETE_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
+    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
+
+    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
+        RESULT_VARIABLE machete_generation_result
+        OUTPUT_VARIABLE machete_generation_output
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+      )
+
+      if (NOT machete_generation_result EQUAL 0)
+        message(FATAL_ERROR "Machete generation failed."
+                " Result: \"${machete_generation_result}\""
+                "\nCheck the log for details: "
+                "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
+      else()
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run machete generate script hash" FORCE)
+        message(STATUS "Machete generation completed successfully.")
+      endif()
     else()
-      message(STATUS "Machete generation completed successfully.")
+      message(STATUS "Machete generation script has not changed, skipping generation.")
     endif()

281369
# Add machete generated sources
282370
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
283371
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
284-
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
285372

286-
set_source_files_properties(
287-
${MACHETE_GEN_SOURCES}
288-
PROPERTIES
289-
COMPILE_FLAGS
290-
"-gencode arch=compute_90a,code=sm_90a")
373+
# forward compatible
374+
set_gencode_flags_for_srcs(
375+
SRCS "${MACHETE_GEN_SOURCES}"
376+
CUDA_ARCHS "${MACHETE_ARCHS}")
377+
378+
list(APPEND VLLM_EXT_SRC
379+
csrc/quantization/machete/machete_pytorch.cu)
380+
381+
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
382+
else()
383+
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
384+
AND MACHETE_ARCHS)
385+
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
386+
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
387+
"later if you intend on running w4a16 quantized models on "
388+
"Hopper.")
389+
else()
390+
message(STATUS "Not building Machete kernels as no compatible archs "
391+
"found in CUDA target architectures")
392+
endif()
291393
endif()
292-
293-
# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
294-
# raise an error if the user that this was built with an incompatible
295-
# CUDA version)
296-
list(APPEND VLLM_EXT_SRC
297-
csrc/quantization/machete/machete_pytorch.cu)
394+
# if CUDA endif
298395
endif()
299396

300397
message(STATUS "Enabling C extension.")
@@ -323,14 +420,31 @@ set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/topk_softmax_kernels.cu")
 
+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_MOE_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC
-    "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-    "csrc/moe/marlin_moe_ops.cu")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+  if (MARLIN_MOE_ARCHS)
+    set(MARLIN_MOE_SRC
+      "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
+      "csrc/moe/marlin_moe_ops.cu")
+
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+  else()
+    message(STATUS "Not building Marlin MOE kernels as no compatible archs found "
+                   "in CUDA target architectures")
+  endif()
 endif()
 
 message(STATUS "Enabling moe extension.")
