@@ -143,6 +143,19 @@ else()
143
143
message (FATAL_ERROR "Can't find CUDA or HIP installation." )
144
144
endif ()
145
145
146
+
147
+ #
148
+ # For cuda we want to be able to control which architectures we compile for on
149
+ # a per-file basis in order to cut down on compile time. So here we extract
150
+ # the set of architectures we want to compile for and remove them from the
151
+ # CMAKE_CUDA_FLAGS so that they are not applied globally.
152
+ #
153
+ if (VLLM_GPU_LANG STREQUAL "CUDA" )
154
+ clear_cuda_arches(CUDA_ARCH_FLAGS)
155
+ extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS} " )
156
+ message (STATUS "CUDA target architectures: ${CUDA_ARCHS} " )
157
+ endif ()
158
+
146
159
#
147
160
# Override the GPU architectures detected by cmake/torch and filter them by
148
161
# the supported versions for the current language.
@@ -223,78 +236,162 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
223
236
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
224
237
"csrc/quantization/aqlm/gemm_kernels.cu"
225
238
"csrc/quantization/awq/gemm_kernels.cu"
226
- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
227
- "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
228
- "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
229
- "csrc/quantization/gptq_marlin/gptq_marlin.cu"
230
- "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
231
- "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
232
239
"csrc/quantization/gguf/gguf_kernel.cu"
233
- "csrc/quantization/fp8/fp8_marlin.cu"
234
240
"csrc/custom_all_reduce.cu"
235
241
"csrc/permute_cols.cu"
236
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
237
- "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
238
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" )
242
+ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" )
243
+
244
+ set_gencode_flags_for_srcs(
245
+ SRCS "${VLLM_EXT_SRC} "
246
+ CUDA_ARCHS "${CUDA_ARCHS} " )
247
+
248
+ # Only build Marlin kernels if we are building for at least some compatible archs.
249
+ # Keep building Marlin for 9.0 as there are some group sizes and shapes that
250
+ # are not supported by Machete yet.
251
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS} " )
252
+ if (MARLIN_ARCHS)
253
+ set (MARLIN_SRCS
254
+ "csrc/quantization/fp8/fp8_marlin.cu"
255
+ "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
256
+ "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
257
+ "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
258
+ "csrc/quantization/gptq_marlin/gptq_marlin.cu"
259
+ "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
260
+ "csrc/quantization/gptq_marlin/awq_marlin_repack.cu" )
261
+ set_gencode_flags_for_srcs(
262
+ SRCS "${MARLIN_SRCS} "
263
+ CUDA_ARCHS "${MARLIN_ARCHS} " )
264
+ list (APPEND VLLM_EXT_SRC "${MARLIN_SRCS} " )
265
+ message (STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS} " )
266
+ else ()
267
+ message (STATUS "Not building Marlin kernels as no compatible archs found "
268
+ "in CUDA target architectures" )
269
+ endif ()
270
+
271
+ #
272
+ # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
273
+ # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
274
+ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS} " )
275
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
276
+ set (SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" )
277
+ set_gencode_flags_for_srcs(
278
+ SRCS "${SRCS} "
279
+ CUDA_ARCHS "${SCALED_MM_3X_ARCHS} " )
280
+ list (APPEND VLLM_EXT_SRC "${SRCS} " )
281
+ list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1" )
282
+ message (STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
283
+ else ()
284
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
285
+ message (STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
286
+ "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
287
+ "later if you intend on running FP8 quantized models on "
288
+ "Hopper." )
289
+ else ()
290
+ message (STATUS "Not building scaled_mm_c3x as no compatible archs found "
291
+ "in CUDA target architectures" )
292
+ endif ()
293
+
294
+ # Clear SCALED_MM_3X_ARCHS only after it has been inspected above, so the
295
+ # scaled_mm_c2x kernels know we didn't build any 3x kernels.
296
+ set (SCALED_MM_3X_ARCHS)
297
+ endif ()
239
298
240
299
#
241
- # The CUTLASS kernels for Hopper require sm90a to be enabled.
242
- # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
243
- # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
244
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
245
- set_source_files_properties (
246
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
247
- PROPERTIES
248
- COMPILE_FLAGS
249
- "-gencode arch=compute_90a,code=sm_90a" )
300
+ # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
301
+ # kernels for the remaining archs that are not already built for 3x.
302
+ cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
303
+ "7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS} " )
304
+ # subtract out the archs that are already built for 3x
305
+ list (REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS} )
306
+ if (SCALED_MM_2X_ARCHS)
307
+ set (SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" )
308
+ set_gencode_flags_for_srcs(
309
+ SRCS "${SRCS} "
310
+ CUDA_ARCHS "${SCALED_MM_2X_ARCHS} " )
311
+ list (APPEND VLLM_EXT_SRC "${SRCS} " )
312
+ list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1" )
313
+ message (STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS} " )
314
+ else ()
315
+ if (SCALED_MM_3X_ARCHS)
316
+ message (STATUS "Not building scaled_mm_c2x as all archs are already built"
317
+ " for and covered by scaled_mm_c3x" )
318
+ else ()
319
+ message (STATUS "Not building scaled_mm_c2x as no compatible archs found "
320
+ "in CUDA target architectures" )
321
+ endif ()
250
322
endif ()
251
323
252
324
253
325
#
254
326
# Machete kernels
255
327
256
328
# The machete kernels only work on hopper and require CUDA 12.0 or later.
257
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
329
+ # Only build Machete kernels if we are building for something compatible with sm90a
330
+ cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS} " )
331
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
258
332
#
259
333
# For the Machete kernels we automatically generate sources for various
260
334
# preselected input type pairs and schedules.
261
335
# Generate sources:
262
- execute_process (
263
- COMMAND ${CMAKE_COMMAND} -E env
264
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR} /csrc/cutlass_extensions/:${CUTLASS_DIR} /python/:${VLLM_PYTHON_PATH} :$PYTHONPATH
265
- ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR} /csrc/quantization/machete/generate.py
266
- RESULT_VARIABLE machete_generation_result
267
- OUTPUT_VARIABLE machete_generation_output
268
- OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR} /machete_generation.log
269
- ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR} /machete_generation.log
270
- )
271
-
272
- if (NOT machete_generation_result EQUAL 0)
273
- message (FATAL_ERROR "Machete generation failed."
274
- " Result: \" ${machete_generation_result} \" "
275
- "\n Check the log for details: "
276
- "${CMAKE_CURRENT_BINARY_DIR} /machete_generation.log" )
336
+ set (MACHETE_GEN_SCRIPT
337
+ ${CMAKE_CURRENT_SOURCE_DIR} /csrc/quantization/machete/generate.py)
338
+ file (MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
339
+
340
+ message (STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH} " )
341
+ message (STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH} " )
342
+
343
+ if (NOT DEFINED CACHE {MACHETE_GEN_SCRIPT_HASH}
344
+ OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH} )
345
+ execute_process (
346
+ COMMAND ${CMAKE_COMMAND} -E env
347
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR} /csrc/cutlass_extensions/:${CUTLASS_DIR} /python/:${VLLM_PYTHON_PATH} :$PYTHONPATH
348
+ ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
349
+ RESULT_VARIABLE machete_generation_result
350
+ OUTPUT_VARIABLE machete_generation_output
351
+ OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR} /machete_generation.log
352
+ ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR} /machete_generation.log
353
+ )
354
+
355
+ if (NOT machete_generation_result EQUAL 0)
356
+ message (FATAL_ERROR "Machete generation failed."
357
+ " Result: \" ${machete_generation_result} \" "
358
+ "\n Check the log for details: "
359
+ "${CMAKE_CURRENT_BINARY_DIR} /machete_generation.log" )
360
+ else ()
361
+ set (MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
362
+ CACHE STRING "Last run machete generate script hash" FORCE)
363
+ message (STATUS "Machete generation completed successfully." )
364
+ endif ()
277
365
else ()
278
- message (STATUS "Machete generation completed successfully ." )
366
+ message (STATUS "Machete generation script has not changed, skipping generation ." )
279
367
endif ()
280
368
281
369
# Add machete generated sources
282
370
file (GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu" )
283
371
list (APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES} )
284
- message (STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES} " )
285
372
286
- set_source_files_properties (
287
- ${MACHETE_GEN_SOURCES}
288
- PROPERTIES
289
- COMPILE_FLAGS
290
- "-gencode arch=compute_90a,code=sm_90a" )
373
+ # forward compatible
374
+ set_gencode_flags_for_srcs(
375
+ SRCS "${MACHETE_GEN_SOURCES} "
376
+ CUDA_ARCHS "${MACHETE_ARCHS} " )
377
+
378
+ list (APPEND VLLM_EXT_SRC
379
+ csrc/quantization/machete/machete_pytorch.cu)
380
+
381
+ message (STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS} " )
382
+ else ()
383
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
384
+ AND MACHETE_ARCHS)
385
+ message (STATUS "Not building Machete kernels as CUDA Compiler version is "
386
+ "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
387
+ "later if you intend on running w4a16 quantized models on "
388
+ "Hopper." )
389
+ else ()
390
+ message (STATUS "Not building Machete kernels as no compatible archs "
391
+ "found in CUDA target architectures" )
392
+ endif ()
291
393
endif ()
292
-
293
- # Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
294
- # raise an error if the user that this was built with an incompatible
295
- # CUDA version)
296
- list (APPEND VLLM_EXT_SRC
297
- csrc/quantization/machete/machete_pytorch.cu)
394
+ # if CUDA endif
298
395
endif ()
299
396
300
397
message (STATUS "Enabling C extension." )
@@ -323,14 +420,31 @@ set(VLLM_MOE_EXT_SRC
323
420
"csrc/moe/torch_bindings.cpp"
324
421
"csrc/moe/topk_softmax_kernels.cu" )
325
422
423
+ set_gencode_flags_for_srcs(
424
+ SRCS "${VLLM_MOE_EXT_SRC} "
425
+ CUDA_ARCHS "${CUDA_ARCHS} " )
426
+
326
427
if (VLLM_GPU_LANG STREQUAL "CUDA" )
327
- list (APPEND VLLM_MOE_EXT_SRC
328
- "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
329
- "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
330
- "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
331
- "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
332
- "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
333
- "csrc/moe/marlin_moe_ops.cu" )
428
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS} " )
429
+ if (MARLIN_MOE_ARCHS)
430
+ set (MARLIN_MOE_SRC
431
+ "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
432
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
433
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
434
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
435
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
436
+ "csrc/moe/marlin_moe_ops.cu" )
437
+
438
+ set_gencode_flags_for_srcs(
439
+ SRCS "${MARLIN_MOE_SRC} "
440
+ CUDA_ARCHS "${MARLIN_MOE_ARCHS} " )
441
+
442
+ list (APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC} " )
443
+ message (STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS} " )
444
+ else ()
445
+ message (STATUS "Not building Marlin MOE kernels as no compatible archs found "
446
+ "in CUDA target architectures" )
447
+ endif ()
334
448
endif ()
335
449
336
450
message (STATUS "Enabling moe extension." )
0 commit comments