Skip to content

Commit 6ad1baa

Browse files
committed
Quantized KV Cache : QKV - 22 modes
FA : 0 (F16) - 14, K and V can be quantized. No FA : 15 (F16) - 22 - only K will be quantized, and TG will be 30-40% slower.
1 parent 6c94481 commit 6ad1baa

File tree

4 files changed

+289
-26
lines changed

4 files changed

+289
-26
lines changed

CMakeLists.txt

Lines changed: 155 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
6161
option(LLAMA_OPENMP "llama: use OpenMP" OFF)
6262

6363
# Croco.Cpp Specifics
64+
option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile 18 quants for FlashAttention" OFF)
6465
option(GGML_CUDA_USE_GRAPHS "Use Cuda Graphs to increase a bit performancess" OFF)
6566
SET(GGML_SCHED_MAX_COPIES "1" CACHE STRING "llama: max input copies for pipeline parallelism")
6667
option(LLAMA_CUDA_ENABLE_UNIFIED_MEMORY "llama: enable to avoid OOM in Full Offload" OFF)
@@ -115,13 +116,83 @@ if (LLAMA_CUBLAS)
115116
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
116117
endif()
117118

118-
# only build minimal quants required for fattn quant kv
119-
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
120-
list(APPEND GGML_SOURCES_CUDA ${SRCS})
121-
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
122-
list(APPEND GGML_SOURCES_CUDA ${SRCS})
123-
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
124-
list(APPEND GGML_SOURCES_CUDA ${SRCS})
119+
if (LLAMA_CUDA_FA_ALL_QUANTS)
120+
# all quants necessary for Kobold CPP Frankenstein are compiled
121+
# the others are ignored but not deleted from the ggml_cuda templates directory
122+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
123+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
124+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_0.cu")
125+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
126+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu")
127+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
128+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_1.cu")
129+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
130+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_0.cu")
131+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
132+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_1.cu")
133+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
134+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-iq4_nl.cu")
135+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
136+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q5_0.cu")
137+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
138+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_0.cu")
139+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
140+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_1.cu")
141+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
142+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-iq4_nl.cu")
143+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
144+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_0.cu")
145+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
146+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_1.cu")
147+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
148+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-iq4_nl.cu")
149+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
150+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q5_0.cu")
151+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
152+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q6_0.cu")
153+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
154+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_0.cu")
155+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
156+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_1.cu")
157+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
158+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu")
159+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
160+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_0.cu")
161+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
162+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu")
163+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
164+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu")
165+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
166+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
167+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
168+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
169+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
170+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
171+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
172+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")
173+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
174+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu")
175+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
176+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_1.cu")
177+
# list(APPEND GGML_SOURCES_CUDA ${SRCS})
178+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q6_0.cu")
179+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
180+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q8_0.cu")
181+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
182+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
183+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
184+
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
185+
else ()
186+
# only build minimal quants required for fattn quant kv
187+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu")
188+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
189+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
190+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
191+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
192+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
193+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
194+
list(APPEND GGML_SOURCES_CUDA ${SRCS})
195+
endif()
125196

126197
if (LLAMA_CUDA_ENABLE_UNIFIED_MEMORY)
127198
add_compile_definitions(GGML_CUDA_ENABLE_UNIFIED_MEMORY)
@@ -201,13 +272,83 @@ if (LLAMA_HIPBLAS)
201272
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
202273
endif()
203274

204-
# only build minimal quants required for fattn quant kv
205-
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
206-
list(APPEND GGML_SOURCES_ROCM ${SRCS})
207-
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
208-
list(APPEND GGML_SOURCES_ROCM ${SRCS})
209-
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
210-
list(APPEND GGML_SOURCES_ROCM ${SRCS})
275+
if (LLAMA_CUDA_FA_ALL_QUANTS)
276+
# all quants necessary for Kobold CPP Frankenstein are compiled
277+
# the others are ignored but not deleted from the ggml_cuda templates directory
278+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
279+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
280+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_0.cu")
281+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
282+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu")
283+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
284+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_1.cu")
285+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
286+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_0.cu")
287+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
288+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_1.cu")
289+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
290+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-iq4_nl.cu")
291+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
292+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q5_0.cu")
293+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
294+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_0.cu")
295+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
296+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_1.cu")
297+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
298+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-iq4_nl.cu")
299+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
300+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_0.cu")
301+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
302+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_1.cu")
303+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
304+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-iq4_nl.cu")
305+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
306+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q5_0.cu")
307+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
308+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q6_0.cu")
309+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
310+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_0.cu")
311+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
312+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_1.cu")
313+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
314+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu")
315+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
316+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_0.cu")
317+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
318+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu")
319+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
320+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu")
321+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
322+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
323+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
324+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
325+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
326+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
327+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
328+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")
329+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
330+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu")
331+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
332+
# file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_1.cu")
333+
# list(APPEND GGML_SOURCES_ROCM ${SRCS})
334+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q6_0.cu")
335+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
336+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q8_0.cu")
337+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
338+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
339+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
340+
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
341+
else ()
342+
# only build minimal quants required for fattn quant kv
343+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu")
344+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
345+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
346+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
347+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
348+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
349+
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
350+
list(APPEND GGML_SOURCES_ROCM ${SRCS})
351+
endif()
211352

212353
# only build minimal quants required for fattn quant kv
213354
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})

Makefile

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,48 @@ endif
177177
# it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
178178
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
179179
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
180-
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
181-
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
182-
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
180+
181+
ifdef LLAMA_CUDA_FA_ALL_QUANTS
182+
# all quants necessary for Kobold CPP Frankenstein are compiled
183+
# the others are ignored but not deleted from the ggml_cuda templates directory
184+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu))
185+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
186+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_0.cu))
187+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_1.cu))
188+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-iq4_nl.cu))
189+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_0.cu))
190+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_1.cu))
191+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q5_0.cu))
192+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-iq4_nl.cu))
193+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_0.cu))
194+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_1.cu))
195+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_0.cu))
196+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-iq4_nl.cu))
197+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q5_0.cu))
198+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q6_0.cu))
199+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_1.cu))
200+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu))
201+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_0.cu))
202+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_1.cu))
203+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_0.cu))
204+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu))
205+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu))
206+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
207+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu))
208+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu))
209+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu))
210+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu))
211+
# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_1.cu))
212+
# f16-q6_0 is part of the CMake ALL_QUANTS source list too; keep both build systems in sync
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q6_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q8_0.cu))
213+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
214+
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
215+
HIPFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
216+
else
217+
# minimal set: same iq4_nl-iq4_nl instance that the CMake non-ALL_QUANTS branch builds
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu))
218+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
219+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
220+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
221+
endif # LLAMA_CUDA_FA_ALL_QUANTS
183222

184223
ifdef LLAMA_CUBLAS
185224
CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include

0 commit comments

Comments
 (0)