
Remove sm100+ requirement for trtllm allreduce kernels #1249


Merged: 9 commits, Jul 14, 2025
3 changes: 2 additions & 1 deletion flashinfer/comm/trtllm_ar.py
@@ -96,14 +96,15 @@ class FP4QuantizationSFLayout:


def gen_trtllm_comm_module() -> JitSpec:
+   major, minor = torch.cuda.get_device_capability()
    return gen_jit_spec(
        "trtllm_comm",
        [
            jit_env.FLASHINFER_CSRC_DIR / "trtllm_allreduce.cu",
            jit_env.FLASHINFER_CSRC_DIR / "trtllm_allreduce_fusion.cu",
            jit_env.FLASHINFER_CSRC_DIR / "trtllm_moe_allreduce_fusion.cu",
        ],
-       extra_cuda_cflags=sm100a_nvcc_flags,
+       extra_cuda_cflags=sm100a_nvcc_flags if major >= 10 and minor >= 0 else [],
    )
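
For reference, here is a minimal host-side sketch (not part of this PR) of the same capability gate, written against the CUDA runtime API rather than torch.cuda.get_device_capability(). The -gencode value is an assumed stand-in for sm100a_nvcc_flags, and because minor is never negative the check reduces to major >= 10, i.e. only compute capability 10.x (Blackwell-class) devices get the sm_100a flags.

    // Sketch only: mirrors the gate added above using the CUDA runtime API.
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <string>
    #include <vector>

    std::vector<std::string> select_extra_cuda_cflags(int device = 0) {
      int major = 0, minor = 0;
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
      // `major >= 10 && minor >= 0` is equivalent to `major >= 10`.
      if (major >= 10 && minor >= 0) {
        return {"-gencode=arch=compute_100a,code=sm_100a"};  // assumed flag value
      }
      return {};  // older architectures: compile without the sm_100a flags
    }

    int main() {
      for (const auto& flag : select_extra_cuda_cflags()) {
        std::printf("%s\n", flag.c_str());
      }
      return 0;
    }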


31 changes: 27 additions & 4 deletions include/flashinfer/comm/trtllm_allreduce_fusion.cuh
@@ -1,6 +1,7 @@
#include <cooperative_groups.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
+#include <cuda_fp4.h>

#include <cuda/std/optional>
#include <tuple>
@@ -522,7 +523,17 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
return nullptr;
}

+__forceinline__ __device__ uint32_t pack_bytes(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3) {
+  uint32_t val0 = c0;
+  uint32_t val1 = c1;
+  uint32_t val2 = c2;
+  uint32_t val3 = c3;
+
+  return (val3 << 24) | (val2 << 16) | (val1 << 8) | val0;
+}

// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+// NOTE: bypass sm_100 requirement by __nv_cvt_float2_to_fp4x2
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
uint32_t val;
@@ -543,8 +554,14 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
"f"(array[6]), "f"(array[7]));
return val;
#else
-  // static_assert(false, "not supported.");
-  return 0;
+  uint32_t val;
+  __nv_fp4x2_storage_t vals[4];
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    vals[i] = __nv_cvt_float2_to_fp4x2(*(((float2*)array) + i), __NV_E2M1, cudaRoundNearest);
+  }
Comment on lines +560 to +562

Contributor (medium):

Using a C-style cast (float2*) is generally discouraged in C++. It's better to use reinterpret_cast for this kind of type punning to make the intent clearer and the code safer. Additionally, for performance-critical code like this, consider adding #pragma unroll before the for loop to encourage the compiler to unroll it, which can improve performance.

Suggested change:

    vals[i] = __nv_cvt_float2_to_fp4x2(*(reinterpret_cast<float2*>(array) + i), __NV_E2M1, cudaRoundNearest);

+  val = pack_bytes(vals[0], vals[1], vals[2], vals[3]);
+  return val;
#endif
}
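
To make the new fallback concrete, here is a standalone sketch (not part of the PR) of the same idea: eight floats converted pairwise with __nv_cvt_float2_to_fp4x2 and the four resulting fp4x2 bytes packed into one uint32_t. It assumes CUDA 12.8+ for <cuda_fp4.h>; the kernel and helper names are hypothetical, and it builds float2 values with make_float2 rather than casting the float array, which also sidesteps the cast concern raised in the review comment above.

    // Sketch only: eight fp32 values -> eight packed e2m1 nibbles (one uint32_t).
    #include <cuda_fp4.h>
    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    __device__ uint32_t pack_bytes_sketch(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3) {
      return (uint32_t(c3) << 24) | (uint32_t(c2) << 16) | (uint32_t(c1) << 8) | uint32_t(c0);
    }

    __global__ void fp32x8_to_e2m1_kernel(const float* in, uint32_t* out) {
      __nv_fp4x2_storage_t vals[4];
    #pragma unroll
      for (int i = 0; i < 4; i++) {
        // Each call converts one pair of floats into two 4-bit e2m1 codes (one byte).
        vals[i] = __nv_cvt_float2_to_fp4x2(make_float2(in[2 * i], in[2 * i + 1]), __NV_E2M1,
                                           cudaRoundNearest);
      }
      *out = pack_bytes_sketch(vals[0], vals[1], vals[2], vals[3]);
    }

    int main() {
      const float h_in[8] = {0.f, 0.5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f};
      float* d_in = nullptr;
      uint32_t* d_out = nullptr;
      uint32_t h_out = 0;
      cudaMalloc(&d_in, sizeof(h_in));
      cudaMalloc(&d_out, sizeof(uint32_t));
      cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
      fp32x8_to_e2m1_kernel<<<1, 1>>>(d_in, d_out);
      cudaMemcpy(&h_out, d_out, sizeof(uint32_t), cudaMemcpyDeviceToHost);
      std::printf("packed e2m1 word: 0x%08x\n", (unsigned)h_out);
      cudaFree(d_in);
      cudaFree(d_out);
      return 0;
    }

On a compute-capability-10.x GPU the same inputs can be run through the inline-PTX branch to confirm the two paths produce the same packed word.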

@@ -569,8 +586,14 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
"f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
return val;
#else
-  // static_assert(false, "not supported.");
-  return 0;
+  uint32_t val;
+  __nv_fp4x2_storage_t vals[4];
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    vals[i] = __nv_cvt_float2_to_fp4x2(array[i], __NV_E2M1, cudaRoundNearest);
+  }
+  val = pack_bytes(vals[0], vals[1], vals[2], vals[3]);
+  return val;
#endif
}

31 changes: 27 additions & 4 deletions include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
@@ -1,6 +1,7 @@
#include <cooperative_groups.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
+#include <cuda_fp4.h>

#include <cuda/std/optional>
#include <tuple>
@@ -509,7 +510,17 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
return nullptr;
}

+__forceinline__ __device__ uint32_t pack_bytes(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3) {
+  uint32_t val0 = c0;
+  uint32_t val1 = c1;
+  uint32_t val2 = c2;
+  uint32_t val3 = c3;
+
+  return (val3 << 24) | (val2 << 16) | (val1 << 8) | val0;
+}

// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+// NOTE:bypass sm_100 requirement by __nv_cvt_float2_to_fp4x2
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
uint32_t val;
@@ -530,8 +541,14 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
"f"(array[6]), "f"(array[7]));
return val;
#else
-  // static_assert(false, "not supported.");
-  return 0;
+  uint32_t val;
+  __nv_fp4x2_storage_t vals[4];
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    vals[i] = __nv_cvt_float2_to_fp4x2(*(((float2*)array) + i), __NV_E2M1, cudaRoundNearest);
+  }
+  val = pack_bytes(vals[0], vals[1], vals[2], vals[3]);
+  return val;
#endif
}

@@ -556,8 +573,14 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
"f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
return val;
#else
-  // static_assert(false, "not supported.");
-  return 0;
+  uint32_t val;
+  __nv_fp4x2_storage_t vals[4];
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    vals[i] = __nv_cvt_float2_to_fp4x2(array[i], __NV_E2M1, cudaRoundNearest);
+  }
+  val = pack_bytes(vals[0], vals[1], vals[2], vals[3]);
+  return val;
#endif
}
