diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index a5ddeac740440..b5be3befa07e2 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -25,7 +25,9 @@
 
 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
+#ifdef ENABLE_FP8_E5M2
 #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+#endif
 
 #include <algorithm>
 
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index fe0159e404585..ceb7347d94670 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -4,13 +4,20 @@
 
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
+#ifdef ENABLE_FP8_E5M2
 #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+#endif
 
 #include <algorithm>
 #include <cassert>
 #include <map>
 #include <vector>
 
+#ifdef USE_ROCM
+  #include <hip/hip_bf16.h>
+  typedef __hip_bfloat16 __nv_bfloat16;
+#endif
+
 void swap_blocks(
   torch::Tensor& src,
   torch::Tensor& dst,
diff --git a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh
index c3b0d311b89cc..9bcab25db03cf 100644
--- a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh
+++ b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh
@@ -9,7 +9,6 @@
 #include "../../attention/dtype_float16.cuh"
 #include "../../attention/dtype_bfloat16.cuh"
 
-#pragma once
 
 namespace vllm {
 #ifdef ENABLE_FP8_E5M2