diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index e9b6e28fa6bcb..79b85d8cad0d5 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -30,12 +30,6 @@ jobs:
       run: |
         EXCLUDES=(
             'csrc/moe/topk_softmax_kernels.cu'
-            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
-            'csrc/punica/bgmv/bgmv_config.h'
-            'csrc/punica/bgmv/bgmv_impl.cuh'
-            'csrc/punica/bgmv/vec_dtypes.cuh'
-            'csrc/punica/punica_ops.cu'
-            'csrc/punica/type_convert.h'
         )
         find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
             | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 4869cad541135..69998b45be70a 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -181,7 +181,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
   #
   # The torch cmake setup hardcodes the detected architecture flags in
   # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
-  # can't modified on a per-target basis, e.g. for the `punica` extension.
+  # can't be modified on a per-target basis.
   # So, all the `-gencode` flags need to be extracted and removed from
   # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
   # Since it's not possible to use `target_compiler_options` for adding target
diff --git a/format.sh b/format.sh
index abc688c702aa6..baaebc811d405 100755
--- a/format.sh
+++ b/format.sh
@@ -242,12 +242,6 @@ echo 'vLLM isort: Done'
 # NOTE: Keep up to date with .github/workflows/clang-format.yml
 CLANG_FORMAT_EXCLUDES=(
     'csrc/moe/topk_softmax_kernels.cu'
-    'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
-    'csrc/punica/bgmv/bgmv_config.h'
-    'csrc/punica/bgmv/bgmv_impl.cuh'
-    'csrc/punica/bgmv/vec_dtypes.cuh'
-    'csrc/punica/punica_ops.cu'
-    'csrc/punica/type_convert.h'
 )
 
 # Format specified files with clang-format
diff --git a/vllm/config.py b/vllm/config.py
index 0524514f6633a..35945e34452d2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1304,7 +1304,7 @@ class LoRAConfig:
     long_lora_scaling_factors: Optional[Tuple[float]] = None
 
     def __post_init__(self):
-        # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
+        # TODO: Increase the range of rank
         possible_max_ranks = (8, 16, 32, 64)
         possible_lora_extra_vocab_size = (0, 256, 512)
         if self.max_lora_rank not in possible_max_ranks:
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 42ec99e6ea2c8..d3978ff6f4ff1 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1073,7 +1073,7 @@ def create_lora_weights(
         lora_config: LoRAConfig,
         model_config: Optional[PretrainedConfig] = None,
     ) -> None:
-        # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
+        # TODO: Verify if this condition can be relaxed
         if 32000 < self.base_layer.vocab_size > 128512:
             raise ValueError("When using LoRA, vocab size must be "
                              "32000 >= vocab_size <= 128512")
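
With the `csrc/punica/bgmv/bgmv_config.h` header no longer the reference point, the rank and extra-vocab-size limits touched by the `vllm/config.py` hunk above are enforced only by the Python-side checks. Below is a minimal, self-contained sketch of that validation under those assumptions; `LoRAConfigSketch` and its error messages are illustrative stand-ins, not vLLM's actual `LoRAConfig` class.

```python
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class LoRAConfigSketch:
    """Illustrative stand-in for vllm.config.LoRAConfig (not the real class)."""
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    long_lora_scaling_factors: Optional[Tuple[float, ...]] = None

    def __post_init__(self):
        # Value sets taken from the vllm/config.py hunk above.
        possible_max_ranks = (8, 16, 32, 64)
        possible_lora_extra_vocab_size = (0, 256, 512)
        if self.max_lora_rank not in possible_max_ranks:
            raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be "
                             f"one of {possible_max_ranks}.")
        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
            raise ValueError(
                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) must "
                f"be one of {possible_lora_extra_vocab_size}.")


LoRAConfigSketch(max_lora_rank=32)      # accepted
try:
    LoRAConfigSketch(max_lora_rank=48)  # rejected: not in (8, 16, 32, 64)
except ValueError as exc:
    print(exc)
```

Ranks outside `(8, 16, 32, 64)` and extra vocab sizes outside `(0, 256, 512)` are rejected at config time, matching the tuples shown in the hunk; the TODO comments added by this diff mark both limits as candidates for relaxation.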