@@ -200,7 +200,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT CMAKE_SYSTEM_NAME MATCHES "A
200200 endif ()
201201
202202 # Set C++ compiler and flags
203- set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -fstack-protector-all -Wall -Wextra -Werror -Wno-error=int-in-bool-context -Wno-unused-variable -Wno-error=implicit-fallthrough -Wno-return-type -Wno-unused-parameter -Wno-error=unknown-pragmas -ggdb3 -lpthread -pthread -MT -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -rdynamic -finstrument-functions -O0 -fPIC" )
203+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -fstack-protector-all -Wall -Wextra -Werror -Wno-return-type -Wno-error=int-in-bool-context -Wno-unused-variable -Wno-error=implicit-fallthrough -Wno-return-type -Wno-unused-parameter -Wno-error=unknown-pragmas -ggdb3 -lpthread -pthread -MT -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -rdynamic -finstrument-functions -O0 -fPIC" )
204204 add_compile_definitions (SD_GCC_FUNCTRACE)
205205 endif ()
206206endif ()
@@ -262,7 +262,7 @@ if(SD_CUDA)
262262
263263 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" )
264264 if (SD_GCC_FUNCTRACE STREQUAL "ON" )
265- set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall -Wno-unused-variable -Wno-unused-parameter -Wreturn-type -W -ggdb3 -fPIC -DSD_GCC_FUNCTRACE=1 -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -finstrument-functions -O0" )
265+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall -Wno-return-type -Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-parameter -Wreturn-type -W -ggdb3 -fPIC -DSD_GCC_FUNCTRACE=1 -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -finstrument-functions -O0" )
266266 set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC --device-debug -lineinfo -G" )
267267 add_compile_definitions (SD_GCC_FUNCTRACE)
268268 else ()
@@ -306,7 +306,7 @@ if(SD_CUDA)
306306 endif ()
307307
308308 # Cap the number of registers to prevent resource exhaustion
309- set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --maxrregcount=40 " )
309+ set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -maxrregcount=128 " )
310310
311311 # Define CUDA Architectures
312312 string (TOLOWER "${COMPUTE}" COMPUTE_CMP)
@@ -371,6 +371,11 @@ if(SD_CUDA)
371371 endif ()
372372 include (${CMAKE_CURRENT_SOURCE_DIR}/../cmake/TypeMST.cmake)
373373
374+ # 2. Process the compilation_units templates
375+ file (GLOB CUDA_COMPILATION_UNITS ../include/loops/cuda/compilation_units/*.cu.in)
376+ foreach (FL_ITEM ${CUDA_COMPILATION_UNITS} )
377+ genCompilation(${FL_ITEM} )
378+ endforeach ()
374379
375380 # Decide whether to use all combinations or optimized MST combinations
376381 set (SD_USE_MST_TYPES ON )
@@ -423,7 +428,7 @@ if(SD_CUDA)
423428 file (GLOB_RECURSE INDEXING_SOURCES ../include/indexing/*.cpp ../include/indexing/*.h)
424429 file (GLOB_RECURSE LOOPS_SOURCES ../include/loops/impl/*.cpp ../include/loops/*.h ../include/loops/*.chpp)
425430 file (GLOB_RECURSE LEGACY_SOURCES ../include/legacy/impl/*.cpp ../include/legacy/*.cu ../include/legacy/*.h)
426- file (GLOB_RECURSE LOOPS_SOURCES_CUDA ../include/loops/*.cu)
431+ file (GLOB_RECURSE LOOPS_SOURCES_CUDA ../include/loops/*.cu ../include/loops/cuda/**/*.cu )
427432 file (GLOB_RECURSE COMPILATION_UNITS ../include/loops/cuda/compilation_units/*.cu.in ../include/ops/impl/compilation_units/*.cpp.in)
428433 file (GLOB_RECURSE COMPILATION_UNITS ../include/loops/cuda/compilation_units/*.cu.in ../include/loops/cuda/comb_compilation_units/*.cu.in ../include/ops/impl/compilation_units/*.cpp.in)
429434
@@ -485,6 +490,8 @@ if(SD_CUDA)
485490 ${CUSTOMOPS_ONEDNN_SOURCES}
486491 ${CUSTOMOPS_ARMCOMPUTE_SOURCES}
487492 ${CUSTOMOPS_GENERIC_SOURCES}
493+ ${LOOPS_SOURCES_CUDA}
494+
488495 )
489496 else ()
490497 add_library (samediff_obj OBJECT
@@ -509,6 +516,8 @@ if(SD_CUDA)
509516 ${CUSTOMOPS_ONEDNN_SOURCES}
510517 ${CUSTOMOPS_ARMCOMPUTE_SOURCES}
511518 ${CUSTOMOPS_GENERIC_SOURCES}
519+ ${LOOPS_SOURCES_CUDA}
520+
512521 )
513522 endif ()
514523
0 commit comments