GPU linear operators #163

Open · wants to merge 50 commits into base: main
5c2e71c
init design (no working implementation)
Abdelrahman912 Dec 18, 2024
8636a49
init setup
Abdelrahman912 Dec 20, 2024
34d8b79
make cuda work for the test kernel
Abdelrahman912 Dec 21, 2024
f7bd306
add gpu subdof and init gpu update operator
Abdelrahman912 Jan 6, 2025
7eedd51
add subdof iterator
Abdelrahman912 Jan 6, 2025
16846e4
minor fix in sdh iterator
Abdelrahman912 Jan 6, 2025
5787e82
first working example (not refined)
Abdelrahman912 Jan 9, 2025
c58a14f
working example (not refined)
Abdelrahman912 Jan 9, 2025
324dbd0
Merge branch 'main' into gpu-operators
Abdelrahman912 Jan 9, 2025
139fc83
minor adjustment
Abdelrahman912 Jan 9, 2025
babc06c
minor refinement
Abdelrahman912 Jan 13, 2025
fe064cf
init add coeffs
Abdelrahman912 Jan 13, 2025
6bdae08
add to extension (doesn't dispatch to ext tho)
Abdelrahman912 Jan 14, 2025
c5c8b0b
minor edit
Abdelrahman912 Jan 14, 2025
4d3f7de
move to ext (working implementation)
Abdelrahman912 Jan 15, 2025
ac9c807
minor fix
Abdelrahman912 Jan 15, 2025
0d075a8
add coefficients tests for gpu
Abdelrahman912 Jan 17, 2025
863ab05
minor refinements in coefficients
Abdelrahman912 Jan 17, 2025
0daf36c
minor fix
Abdelrahman912 Jan 17, 2025
7ba3139
restructure PR913
Abdelrahman912 Jan 21, 2025
077572f
change gpu -> device
Abdelrahman912 Jan 21, 2025
34c4536
some restructuring for memory allocation
Abdelrahman912 Jan 24, 2025
0e41a01
fix extension
Abdelrahman912 Jan 25, 2025
f512b88
optimize global mem
Abdelrahman912 Jan 25, 2025
cfc7cdd
minor fix for cuda op
Abdelrahman912 Jan 25, 2025
b59927d
pre kernel launch adapt for dh
Abdelrahman912 Feb 1, 2025
eb93c8d
Merge branch 'main' into gpu-operators
Abdelrahman912 Feb 1, 2025
d4d3bab
minor fix
Abdelrahman912 Feb 1, 2025
23f5413
init fix dh adapt error
Abdelrahman912 Feb 5, 2025
5e5216d
fix memory leak
Abdelrahman912 Feb 6, 2025
d50f7fa
pr review fix
Abdelrahman912 Feb 7, 2025
32c5763
init qp
Abdelrahman912 Feb 11, 2025
7bcdd88
minor fix
Abdelrahman912 Feb 11, 2025
8fe74d2
change operators and fuse mem alloc
Abdelrahman912 Feb 12, 2025
b6be5d4
subdof loop (i.e. get rid of unsafe cuda convert)
Abdelrahman912 Feb 13, 2025
110c7ca
Merge branch 'gpu-operators' into add-quadpoints-in-quadvalues
Abdelrahman912 Feb 13, 2025
ba95bae
use quad points instead of quad values
Abdelrahman912 Feb 13, 2025
97c6b00
minor fix
Abdelrahman912 Feb 14, 2025
69530cc
fix 1
Abdelrahman912 Feb 17, 2025
7339d1c
add complex test function and other stuff
Abdelrahman912 Feb 17, 2025
59fa5b4
fix CI (hopefully!)
Abdelrahman912 Feb 18, 2025
72a8f0e
minor fix
Abdelrahman912 Feb 18, 2025
569ef85
add n_threads and n_blocks as kwargs + change cuda default strategy n…
Abdelrahman912 Feb 18, 2025
a2a8bcd
fix SVector
Abdelrahman912 Feb 18, 2025
a0e28f4
add benchmarks + solve major bugs in memory allocation
Abdelrahman912 Feb 19, 2025
2215395
remove nested closures
Abdelrahman912 Feb 19, 2025
81a6529
add line + remove unnecessary code
Abdelrahman912 Feb 19, 2025
0c1d58f
Merge branch 'main' into gpu-operators
Abdelrahman912 Feb 19, 2025
d1bd648
add benchmarks
Abdelrahman912 Feb 20, 2025
291f3be
change analytical function + add PEALinearOperator benchmark
Abdelrahman912 Feb 20, 2025
3 changes: 2 additions & 1 deletion Project.toml
@@ -55,11 +55,12 @@ julia = "1.10"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Aqua", "DelimitedFiles", "Pkg", "StaticArrays", "Tensors", "Test"]
test = ["Aqua", "DelimitedFiles", "Pkg", "StaticArrays", "Tensors", "Test", "CUDA"]
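Adding CUDA to the test targets only makes it available during testing; for `CuThunderboltExt` to load conditionally, the package also needs CUDA wired up as a weak dependency. A hypothetical sketch of those Project.toml entries follows (the CUDA UUID is taken from the `[extras]` section above; the actual entries in the repository may differ):

```toml
# Hypothetical extension wiring; shown for illustration only.
[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
CuThunderboltExt = "CUDA"
```

With this in place, Julia loads the extension module automatically once both Thunderbolt and CUDA are imported in the same session.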
67 changes: 67 additions & 0 deletions benchmarks/benchmarks-cuda-linear-form.jl
@@ -0,0 +1,67 @@
using BenchmarkTools, Thunderbolt, StaticArrays, Ferrite, CUDA

# The following is needed to force the grid to use Float32.
left = Tensor{1, 3, Float32}((-1.0, -1.0, -1.0))
right = Tensor{1, 3, Float32}((1.0, 1.0, 1.0))

grid = generate_grid(Hexahedron, (500, 100, 100), left, right)

ip_collection = LagrangeCollection{1}()
qr_collection = QuadratureRuleCollection(2)
dh = DofHandler(grid)
add!(dh, :u, getinterpolation(ip_collection, first(grid.cells)))
close!(dh)
cs = CartesianCoordinateSystem(grid)
protocol = AnalyticalTransmembraneStimulationProtocol(
AnalyticalCoefficient((x,t) -> cos(2Ο€ * t) * exp(-norm(x)^2), CoordinateSystemCoefficient(cs)),
[SVector((0.f0, 1.f0))]
)



#############################
# CPU operator Benchmarking #
#############################

linop = Thunderbolt.LinearOperator(
zeros(ndofs(dh)),
protocol,
qr_collection,
dh,
)

@benchmark Thunderbolt.update_operator!($linop,$0.0)


#############################
# GPU operator Benchmarking #
#############################

cuda_strategy = Thunderbolt.CudaAssemblyStrategy()
# Notes on the launch configuration:
# These values are based on the optimal occupancy of my GPU (an Nvidia GeForce RTX 3050 Ti with 4 GB VRAM), measured with Nsight Compute.
# The number of threads per block is 384, and the number of blocks (the number of SMs) is 20.
cuda_op = Thunderbolt.init_linear_operator(cuda_strategy, protocol, qr_collection, dh; n_threads = 384, n_blocks = 20);

## benchmark with BenchmarkTools
@btime Thunderbolt.update_operator!($cuda_op,$0.f0)

# benchmark with CUDA/Nvidia tools
# Nsight Compute command: ncu --mode=launch julia
# Note: run it twice; the first run includes compilation time.
Thunderbolt.update_operator!(cuda_op,0.f0) # warm up
CUDA.@profile trace=true Thunderbolt.update_operator!(cuda_op,0.f0)


######################################
# CPU threaded operator Benchmarking #
######################################

plinop = Thunderbolt.PEALinearOperator(
zeros(ndofs(dh)),
qr_collection,
protocol,
dh,
);

@benchmark Thunderbolt.update_operator!($plinop,$0.0)
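The benchmark above hard-codes `n_threads = 384` and `n_blocks = 20` for one specific GPU. A portable alternative is to query CUDA.jl's occupancy API at runtime. The sketch below is illustrative only: `dummy_kernel` is a stand-in, not Thunderbolt's actual assembly kernel, and the resulting numbers will differ per device.

```julia
# Hypothetical sketch: derive a launch configuration from the device
# instead of hard-coding it. `launch_configuration` is CUDA.jl's
# occupancy API; the kernel here is a stand-in for illustration.
using CUDA

function dummy_kernel(x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    i <= length(x) && (x[i] += 1.0f0)
    return nothing
end

x = CUDA.zeros(Float32, 1024)
kernel = @cuda launch=false dummy_kernel(x)
config = launch_configuration(kernel.fun)
# config.threads: occupancy-optimal threads per block for this kernel;
# config.blocks: number of blocks that saturates the device.
@show config.threads config.blocks
```

Values obtained this way could then be passed as the `n_threads`/`n_blocks` keyword arguments to `init_linear_operator`, avoiding a per-GPU manual Nsight Compute tuning pass.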
13 changes: 10 additions & 3 deletions benchmarks/benchmarks-linear-form.jl
@@ -1,6 +1,6 @@
using BenchmarkTools, Thunderbolt, StaticArrays
using BenchmarkTools, Thunderbolt, StaticArrays, Ferrite

grid = generate_grid(celltype, (1,1,1))
grid = generate_grid(Hexahedron, (1,1,1))
cell_cache = Ferrite.CellCache(grid)
reinit!(cell_cache,1)

@@ -9,12 +9,19 @@ ip = getinterpolation(ip_collection, grid.cells[1])
qr_collection = QuadratureRuleCollection(2)
qr = getquadraturerule(qr_collection, grid.cells[1])
cv = CellValues(qr, ip)
dh = DofHandler(grid)
add!(dh, :u, getinterpolation(ip_collection, first(grid.cells)))
close!(dh)
sdh = first(dh.subdofhandlers)
ac = AnalyticalCoefficient(
(x,t) -> norm(x)+t,
CoordinateSystemCoefficient(
CartesianCoordinateSystem(grid)
)
)

coeff_cache = Thunderbolt.setup_coefficient_cache(ac, qr, sdh)

b = zeros(8)
element_cache = Thunderbolt.AnalyticalCoefficientElementCache(ac, [SVector((0.0,1.0))], cv)
element_cache = Thunderbolt.AnalyticalCoefficientElementCache(coeff_cache, [SVector((0.0,1.0))], cv)
@btime Thunderbolt.assemble_element!($b, $cell_cache, $element_cache, 0.0)
98 changes: 31 additions & 67 deletions ext/CuThunderboltExt.jl
@@ -3,83 +3,42 @@ module CuThunderboltExt
using Thunderbolt

import CUDA:
CUDA, CuArray, CuVector, CUSPARSE,
threadIdx, blockIdx, blockDim, @cuda,
CUDABackend, launch_configuration
CUDA, CuArray, CuVector, CUSPARSE,blockDim,blockIdx,gridDim,threadIdx,
threadIdx, blockIdx, blockDim, @cuda, @cushow,
CUDABackend, launch_configuration, device, cu,cudaconvert

import Thunderbolt:
UnPack.@unpack,
SimpleMesh,
SparseMatrixCSR, SparseMatrixCSC,
AbstractSemidiscreteFunction, AbstractPointwiseFunction, solution_size,
AbstractPointwiseSolverCache,
GPUDofHandlerData, GPUSubDofHandlerData, GPUDofHandler,
GPUGrid
AbstractPointwiseSolverCache,assemble_element!,
LinearOperator,QuadratureRuleCollection,
AnalyticalCoefficientElementCache,AnalyticalCoefficientCache,CartesianCoordinateSystemCache,
setup_element_cache,update_operator!,init_linear_operator,FieldCoefficientCache, CudaAssemblyStrategy, floattype,inttype,
convert_vec_to_concrete,deep_adapt,AbstractElementAssembly,GeneralLinearOperator

import Ferrite:
AbstractDofHandler

import Adapt:
Adapt, adapt_structure, adapt

# ---------------------- Generic part ------------------------
function _convert_subdofhandler_to_gpu(cell_dofs, cell_dofs_offset, sdh::SubDofHandler)
GPUSubDofHandler(
cell_dofs,
cell_dofs_offset,
adapt(typeof(cell_dofs), collect(sdh.cellset)),
Tuple(sym for sym in sdh.field_names),
Tuple(sym for sym in sdh.field_n_components),
sdh.ndofs_per_cell.x,
)
end
import Thunderbolt.FerriteUtils:
StaticInterpolationValues,StaticCellValues, allocate_device_mem,
CellIterator, mem_size, cellmem,ncells,celldofsview,
DeviceDofHandlerData, DeviceSubDofHandler, DeviceDofHandler, DeviceGrid,
cellfe, AbstractDeviceGlobalMem, AbstractDeviceSharedMem,AbstractDeviceCellIterator,AbstractCellMem,
FeMemShape, KeMemShape, KeFeMemShape, DeviceCellIterator,DeviceOutOfBoundCellIterator,DeviceCellCache,
FeCellMem, KeCellMem, KeFeCellMem,NoCellMem,AbstractMemShape

function Adapt.adapt_structure(to::Type{CUDABackend}, dh::DofHandler{sdim}) where sdim
grid = adapt_structure(to, dh.grid)
# field_names = Tuple(sym for sym in dh.field_names)
IndexType = eltype(dh.cell_dofs)
IndexVectorType = CuVector{IndexType}
cell_dofs = adapt(IndexVectorType, dh.cell_dofs)
cell_dofs_offset = adapt(IndexVectorType, dh.cell_dofs_offset)
cell_to_sdh = adapt(IndexVectorType, dh.cell_to_subdofhandler)
    subdofhandlers = Tuple(_convert_subdofhandler_to_gpu(cell_dofs, cell_dofs_offset, sdh) for sdh in dh.subdofhandlers)
gpudata = GPUDofHandlerData(
grid,
subdofhandlers,
# field_names,
cell_dofs,
cell_dofs_offset,
cell_to_sdh,
dh.ndofs.x,
)
return GPUDofHandler(dh, gpudata)
end

import Ferrite:
AbstractDofHandler,get_grid,CellIterator,get_node_coordinate,getcoordinates,get_coordinate_eltype,getcells,
get_node_ids,get_coordinate_type,nnodes

import StaticArrays:
SVector,MVector

import Adapt:
Adapt, adapt_structure, adapt, @adapt_structure

function Adapt.adapt_structure(to::Type{CUDABackend}, grid::Grid{sdim, cell_type, T}) where {sdim, cell_type, T}
node_type = typeof(first(grid.nodes))
cells = Adapt.adapt_structure(to, grid.cells)
nodes = Adapt.adapt_structure(to, grid.nodes)
#TODO subdomain info
return GPUGrid{sdim, cell_type, T, typeof(cells), typeof(nodes)}(cells, nodes)
end

# function Thunderbolt.setup_operator(protocol::Thunderbolt.AnalyticalTransmembraneStimulationProtocol, solver::Thunderbolt.AbstractSolver, dh::GPUDofHandler, field_name::Symbol, qr)
# ip = dh.dh.subdofhandlers[1].field_interpolations[1]
# ip_g = Ferrite.geometric_interpolation(typeof(getcells(Ferrite.get_grid(dh), 1)))
# qr = QuadratureRule{Ferrite.getrefshape(ip_g)}(Ferrite.getorder(ip_g)+1)
# cv = CellValues(qr, ip, ip_g) # TODO replace with GPUCellValues
# return PEALinearOperator(
# zeros(ndofs(dh)),
# AnalyticalCoefficientElementCache(
# protocol.f,
# protocol.nonzero_intervals,
# cv,
# ),
# dh,
# )
# end
# ---------------------- Generic part ------------------------

# Pointwise cuda solver wrapper
function _gpu_pointwise_step_inner_kernel_wrapper!(f, t, Ξ”t, cache::AbstractPointwiseSolverCache)
@@ -99,8 +58,8 @@ function Thunderbolt._pointwise_step_outer_kernel!(f::AbstractPointwiseFunction,
return true
end

_allocate_matrix(dh::GPUDofHandler, A::SparseMatrixCSR, ::CuVector) = CuSparseMatrixCSR(A)
_allocate_matrix(dh::GPUDofHandler, A::SparseMatrixCSC, ::CuVector) = CuSparseMatrixCSC(A)
_allocate_matrix(dh::DeviceDofHandler, A::SparseMatrixCSR, ::CuVector) = CuSparseMatrixCSR(A)
_allocate_matrix(dh::DeviceDofHandler, A::SparseMatrixCSC, ::CuVector) = CuSparseMatrixCSC(A)

Thunderbolt.create_system_vector(::Type{<:CuVector{T}}, f::AbstractSemidiscreteFunction) where T = CUDA.zeros(T, solution_size(f))
Thunderbolt.create_system_vector(::Type{<:CuVector{T}}, dh::DofHandler) where T = CUDA.zeros(T, ndofs(dh))
@@ -121,4 +80,9 @@ function Thunderbolt.adapt_vector_type(::Type{<:CuVector}, v::VT) where {VT <: V
return CuVector(v)
end

include("cuda/cuda_operator.jl")
include("cuda/cuda_memalloc.jl")
include("cuda/cuda_adapt.jl")
include("cuda/cuda_iterator.jl")

end
67 changes: 67 additions & 0 deletions ext/cuda/cuda_adapt.jl
@@ -0,0 +1,67 @@
###################
## adapt Buffers ##
###################

@adapt_structure KeFeGlobalMem
@adapt_structure FeGlobalMem
@adapt_structure KeGlobalMem

#####################################
## Shallow adaption for DofHandler ##
#####################################
function Adapt.adapt_structure(strategy::CudaAssemblyStrategy, dh::DofHandler)
IT = inttype(strategy)
grid = _adapt(strategy, dh.grid)
cell_dofs = dh.cell_dofs .|> (i -> convert(IT,i)) |> cu
cell_dofs_offset = dh.cell_dofs_offset .|> (i -> convert(IT,i)) |> cu
cell_to_sdh = dh.cell_to_subdofhandler .|> (i -> convert(IT,i)) |> cu
dh_data = DeviceDofHandlerData(
grid,
cell_dofs,
cell_dofs_offset,
cell_to_sdh,
convert(IT,dh.ndofs))
subdofhandlers = dh.subdofhandlers .|> (sdh -> _adapt(strategy, sdh,dh_data))
return DeviceDofHandler(dh,subdofhandlers)
end

_symbols_to_int(symbols,IT::Type) = 1:length(symbols) .|> (sym -> convert(IT, sym))


function _adapt(strategy::CudaAssemblyStrategy, sdh::SubDofHandler,dh_data::DeviceDofHandlerData)
IT = inttype(strategy)
cellset = sdh.cellset |> collect .|> (x -> convert(IT, x)) |> cu
field_names = _symbols_to_int(sdh.field_names,IT) |> cu
field_interpolations = sdh.field_interpolations |> convert_vec_to_concrete |> cu
ndofs_per_cell = sdh.ndofs_per_cell
return DeviceSubDofHandler(cellset, field_names, field_interpolations, ndofs_per_cell,dh_data)
end

function _adapt(::CudaAssemblyStrategy, grid::Grid{sdim, cell_type, T}) where {sdim, cell_type, T}
node_type = typeof(first(grid.nodes))
cells = grid.cells |> convert_vec_to_concrete |> cu
nodes = grid.nodes |> cu
#TODO subdomain info
return DeviceGrid{sdim, cell_type, T, typeof(cells), typeof(nodes)}(cells, nodes)
end

########################
## adapt Coefficients ##
########################
function Adapt.adapt_structure(::CudaAssemblyStrategy, element_cache::AnalyticalCoefficientElementCache)
cc = adapt_structure(CuArray, element_cache.cc)
    nz_intervals = adapt(CuArray, element_cache.nonzero_intervals)
sv = adapt_structure(CuArray, element_cache.cv)
return AnalyticalCoefficientElementCache(cc, nz_intervals, sv)
end

function Adapt.adapt_structure(::CudaAssemblyStrategy, cysc::FieldCoefficientCache)
elementwise_data = adapt(CuArray, cysc.elementwise_data)
cv = adapt_structure(CuArray, cysc.cv)
return FieldCoefficientCache(elementwise_data, cv)
end
function Adapt.adapt_structure(::CudaAssemblyStrategy, sphdf::SpatiallyHomogeneousDataField)
    timings = adapt(CuArray, sphdf.timings)
    data = adapt(CuArray, sphdf.data)
return SpatiallyHomogeneousDataField(timings, data)
end
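The adapters above all follow the same Adapt.jl pattern: rebuild a host-side struct field by field, converting each array field to a device representation, so that the resulting object is GPU-safe. A minimal self-contained sketch of that pattern, using made-up types rather than Thunderbolt's own, might look like this:

```julia
# Minimal sketch of the Adapt.jl pattern used above, with hypothetical types.
# Adapt rebuilds a host struct field by field so the result can be passed to a
# kernel, which is what the DofHandler/coefficient adapters in this file do.
using Adapt

struct Cache{V}
    data::V        # host: Vector; device: a CuDeviceVector after conversion
    scale::Float32 # isbits fields are carried over unchanged
end

# Rebuild the struct with each array field adapted to the target backend.
Adapt.adapt_structure(to, c::Cache) = Cache(adapt(to, c.data), c.scale)

c = Cache([1.0f0, 2.0f0], 0.5f0)
c_host = adapt(Array, c)  # no-op on the CPU; with CUDA loaded, adapt(CuArray, c) moves the data
```

Defining `adapt_structure` per type (or applying `@adapt_structure`, as done for the global-memory buffers at the top of this file) is what lets `cu`/`adapt` recurse through nested structs like the dof handler and its subdofhandlers.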