Skip to content

Commit

Permalink
CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (ggerganov#2590)
Browse files (browse the repository at this point in the history)
  • Loading branch information
JohannesGaessler authored Aug 12, 2023
1 parent b19edd5 commit f64d44a
Show file tree
Hide file tree
Showing 2 changed files with 606 additions and 410 deletions.
2 changes: 0 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ option(LLAMA_BLAS "llama: use BLAS"
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
-set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
Expand Down Expand Up @@ -256,7 +255,6 @@ if (LLAMA_CUBLAS)
# if (LLAMA_CUDA_CUBLAS)
# add_compile_definitions(GGML_CUDA_CUBLAS)
# endif()
-add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
Expand Down
Loading

0 comments on commit f64d44a

Please sign in to comment.