Skip to content

Fix cuda_archs_loose_intersection when handling sm_*a #20207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"if you intend on running FP8 quantized MoE models on Hopper.")
else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures")
"in CUDA target architectures.")
endif()
endif()

Expand All @@ -574,7 +574,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
endif()
message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
message(STATUS "Not building moe_data as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
else()
message(STATUS "Not building moe_data as no compatible archs found "
"in CUDA target architectures.")
endif()
endif()

#
# Machete kernels
Expand Down
33 changes: 14 additions & 19 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,8 @@ macro(set_gencode_flags_for_srcs)
endmacro()

#
# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
# `<major>.<minor>[letter]` compute the "loose intersection" with the
# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
# `<major>.<minor>[letter]` compute the "loose intersection" with the
# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
Expand All @@ -278,7 +278,7 @@ endmacro()
# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
# The result is stored in `OUT_CUDA_ARCHS`.
#
# Example:
Expand Down Expand Up @@ -313,21 +313,16 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
set(_CUDA_ARCHS)
if ("9.0a" IN_LIST _SRC_CUDA_ARCHS)
list(REMOVE_ITEM _SRC_CUDA_ARCHS "9.0a")
if ("9.0" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "9.0")
set(_CUDA_ARCHS "9.0a")
endif()
endif()

if ("10.0a" IN_LIST _SRC_CUDA_ARCHS)
list(REMOVE_ITEM _SRC_CUDA_ARCHS "10.0a")
if ("10.0" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "10.0")
set(_CUDA_ARCHS "10.0a")
foreach(_arch ${_SRC_CUDA_ARCHS})
if(_arch MATCHES "\\a$")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we explicitly handle 9.0a or 10.0a? If we have something new like 11.0a, do we expect to have this logic to handle them here?

Copy link
Contributor Author

@huydhn huydhn Jun 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cmake output looks correct when I throw in 12.0a, printing-- Building moe_data for archs: 9.0a;10.0a;12.0a. I couldn't find a reason on why 9.0a and 10.0a needs to be handled explicitly here, and a similar cmake loop logic has already been implemented to handle +PTX suffix. So, having a loop to handle all *a cases looks like the right choice. Let's wait for more context from folks I guess.

Copy link
Collaborator

@LucasWilkinson LucasWilkinson Jun 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the loop is the right solution 👍 this function was originally written when the only "a" variant was 9.0a and we were naively hopefully the the lack of forward compatibility was a temporary one time hopper thing; looks like this is the direction Nvidia is going though 😞 so we should just embrace it haha. I think the 10.0a was just kinda tacked on so good catch!

list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
string(REPLACE "a" "" _base "${_arch}")
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}")
endif()
endif()
endif()
endforeach()

list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

Expand Down Expand Up @@ -359,7 +354,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endforeach()

list(REMOVE_DUPLICATES _CUDA_ARCHS)

# reapply +PTX suffix to architectures that requested PTX
set(_FINAL_ARCHS)
foreach(_arch ${_CUDA_ARCHS})
Expand All @@ -370,7 +365,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endif()
endforeach()
set(_CUDA_ARCHS ${_FINAL_ARCHS})

set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()

Expand Down