@@ -61,6 +61,7 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
6161option (LLAMA_OPENMP "llama: use OpenMP" OFF )
6262
6363# Croco.Cpp-specific build options
64+ option (LLAMA_CUDA_FA_ALL_QUANTS "llama: compile 18 quants for FlashAttention" OFF )
6465option (GGML_CUDA_USE_GRAPHS "Use Cuda Graphs to increase a bit performancess" OFF )
6566SET (GGML_SCHED_MAX_COPIES "1" CACHE STRING "llama: max input copies for pipeline parallelism" )
6667option (LLAMA_CUDA_ENABLE_UNIFIED_MEMORY "llama: enable to avoid OOM in Full Offload" OFF )
@@ -115,13 +116,83 @@ if (LLAMA_CUBLAS)
115116 add_compile_definitions (GGML_CUDA_USE_GRAPHS)
116117 endif ()
117118
118- # only build minimal quants required for fattn quant kv
119- file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu" )
120- list (APPEND GGML_SOURCES_CUDA ${SRCS} )
121- file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu" )
122- list (APPEND GGML_SOURCES_CUDA ${SRCS} )
123- file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu" )
124- list (APPEND GGML_SOURCES_CUDA ${SRCS} )
119+ if (LLAMA_CUDA_FA_ALL_QUANTS)
120+ # build the fattn-vec template instances needed by Kobold CPP Frankenstein (15 glob patterns are enabled below; NOTE(review): the option help text says 18 quants - confirm)
121+ # the other pairs are left commented out; their files are ignored but not deleted from the ggml_cuda templates directory
122+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu" )
123+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
124+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_0.cu")
125+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
126+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu" )
127+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
128+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_1.cu")
129+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
130+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_0.cu")
131+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
132+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_1.cu")
133+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
134+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-iq4_nl.cu" )
135+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
136+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q5_0.cu")
137+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
138+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_0.cu")
139+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
140+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_1.cu")
141+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
142+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-iq4_nl.cu" )
143+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
144+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_0.cu" )
145+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
146+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_1.cu")
147+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
148+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-iq4_nl.cu" )
149+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
150+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q5_0.cu" )
151+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
152+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q6_0.cu" )
153+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
154+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_0.cu")
155+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
156+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_1.cu")
157+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
158+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu" )
159+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
160+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_0.cu" )
161+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
162+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu" )
163+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
164+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu")
165+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
166+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu" )
167+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
168+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
169+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
170+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
171+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
172+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")
173+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
174+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu")
175+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
176+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_1.cu")
177+ # list(APPEND GGML_SOURCES_CUDA ${SRCS})
178+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q6_0.cu" )
179+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
180+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q8_0.cu" )
181+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
182+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu" )
183+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
184+ add_compile_definitions (GGML_CUDA_FA_ALL_QUANTS) # NOTE(review): directory-scoped define set while only a subset of instances is built - confirm the sources do not reference the omitted pairs
185+ else ()
186+ # default: only build the minimal same-type quant pairs (iq4_nl, q4_0, q8_0, f16) required for fattn quant kv
187+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu" )
188+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
189+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu" )
190+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
191+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu" )
192+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
193+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu" )
194+ list (APPEND GGML_SOURCES_CUDA ${SRCS} )
195+ endif ()
125196
126197 if (LLAMA_CUDA_ENABLE_UNIFIED_MEMORY)
127198 add_compile_definitions (GGML_CUDA_ENABLE_UNIFIED_MEMORY)
@@ -201,13 +272,83 @@ if (LLAMA_HIPBLAS)
201272 target_compile_definitions (ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
202273 endif ()
203274
204- # only build minimal quants required for fattn quant kv
205- file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu" )
206- list (APPEND GGML_SOURCES_ROCM ${SRCS} )
207- file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu" )
208- list (APPEND GGML_SOURCES_ROCM ${SRCS} )
209- file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu" )
210- list (APPEND GGML_SOURCES_ROCM ${SRCS} )
275+ if (LLAMA_CUDA_FA_ALL_QUANTS)
276+ # build the fattn-vec template instances needed by Kobold CPP Frankenstein (15 glob patterns are enabled below; NOTE(review): the option help text says 18 quants - confirm)
277+ # the other pairs are left commented out; their files are ignored but not deleted from the ggml_cuda templates directory
278+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu" )
279+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
280+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_0.cu")
281+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
282+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu" )
283+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
284+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_1-q4_1.cu")
285+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
286+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_0.cu")
287+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
288+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q4_1.cu")
289+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
290+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-iq4_nl.cu" )
291+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
292+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_0-q5_0.cu")
293+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
294+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_0.cu")
295+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
296+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q4_1.cu")
297+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
298+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-iq4_nl.cu" )
299+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
300+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_0.cu" )
301+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
302+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q5_1-q5_1.cu")
303+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
304+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-iq4_nl.cu" )
305+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
306+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q5_0.cu" )
307+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
308+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q6_0-q6_0.cu" )
309+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
310+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_0.cu")
311+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
312+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q4_1.cu")
313+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
314+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu" )
315+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
316+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_0.cu" )
317+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
318+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu" )
319+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
320+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu")
321+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
322+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu" )
323+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
324+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
325+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
326+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
327+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
328+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")
329+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
330+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu")
331+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
332+ # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_1.cu")
333+ # list(APPEND GGML_SOURCES_ROCM ${SRCS})
334+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q6_0.cu" )
335+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
336+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q8_0.cu" )
337+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
338+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu" )
339+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
340+ add_compile_definitions (GGML_CUDA_FA_ALL_QUANTS) # NOTE(review): directory-scoped define set while only a subset of instances is built - confirm the sources do not reference the omitted pairs
341+ else ()
342+ # default: only build the minimal same-type quant pairs (iq4_nl, q4_0, q8_0, f16) required for fattn quant kv
343+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*iq4_nl-iq4_nl.cu" )
344+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
345+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu" )
346+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
347+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu" )
348+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
349+ file (GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu" )
350+ list (APPEND GGML_SOURCES_ROCM ${SRCS} )
351+ endif ()
211352
212353 # forward LLAMA_CUDA_DMMV_X to the ggml-rocm target as the GGML_CUDA_DMMV_X compile definition
213354 target_compile_definitions (ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X} )
0 commit comments