
Commit

init fix dh adapt error
Abdelrahman912 committed Feb 5, 2025
1 parent d4d3bab commit 23f5413
Showing 13 changed files with 305 additions and 291 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -60,4 +60,4 @@ Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
-test = ["Aqua", "DelimitedFiles", "Pkg", "StaticArrays", "Tensors", "Test"]
+test = ["Aqua", "DelimitedFiles", "Pkg", "StaticArrays", "Tensors", "Test"]
5 changes: 3 additions & 2 deletions ext/CuThunderboltExt.jl
@@ -5,7 +5,7 @@ using Thunderbolt
import CUDA:
CUDA, CuArray, CuVector, CUSPARSE,blockDim,blockIdx,gridDim,threadIdx,
threadIdx, blockIdx, blockDim, @cuda, @cushow,
-    CUDABackend, launch_configuration, device, cu
+    CUDABackend, launch_configuration, device, cu,cudaconvert

import Thunderbolt:
UnPack.@unpack,
@@ -15,7 +15,8 @@ import Thunderbolt:
AbstractPointwiseSolverCache,assemble_element!,
LinearOperator,AbstractOperatorKernel,QuadratureRuleCollection,
AnalyticalCoefficientElementCache,AnalyticalCoefficientCache,CartesianCoordinateSystemCache,
-    setup_element_cache,update_operator!,init_linear_operator,FieldCoefficientCache, CudaAssemblyStrategy, floattype,inttype
+    setup_element_cache,update_operator!,init_linear_operator,FieldCoefficientCache, CudaAssemblyStrategy, floattype,inttype,
+    convert_vec_to_concrete

import Thunderbolt.FerriteUtils:
StaticInterpolationValues,StaticCellValues, try_allocate_shared_mem,
42 changes: 20 additions & 22 deletions ext/cuda/cuda_adapt.jl
@@ -21,43 +21,41 @@ function _convert_subdofhandler_to_gpu(cell_dofs, cell_dof_soffset, sdh::SubDofH
)
end

-function Adapt.adapt_structure(to, dh::DofHandler{sdim}) where sdim
-    grid = adapt_structure(to, dh.grid)
-    # field_names = Tuple(sym for sym in dh.field_names)
-    #IndexType = eltype(dh.cell_dofs)
-    #IndexVectorType = CuVector{IndexType}
-    cell_dofs = adapt(to, dh.cell_dofs .|> (i -> convert(Int32,i)) |> cu) # currently you cant create Dofhandler with Int32
-    cell_dofs_offset = adapt(to, dh.cell_dofs_offset .|> (i -> convert(Int32,i)) |> cu)
-    cell_to_sdh = adapt(to, dh.cell_to_subdofhandler .|> (i -> convert(Int32,i)) |> cu)
-    #subdofhandlers = Tuple(i->_convert_subdofhandler_to_gpu(cell_dofs, cell_dofs_offset, sdh) for sdh in dh.subdofhandlers)
-    subdofhandlers = adapt_structure(to,dh.subdofhandlers .|> (sdh -> Adapt.adapt_structure(to, sdh)) |> cu)

+function Adapt.adapt_structure(strategy::CudaAssemblyStrategy, dh::DofHandler)
+    IT = inttype(strategy)
+    grid = _adapt(strategy, dh.grid)
+    cell_dofs = dh.cell_dofs .|> (i -> convert(IT,i)) |> cu
+    cell_dofs_offset = dh.cell_dofs_offset .|> (i -> convert(IT,i)) |> cu
+    cell_to_sdh = dh.cell_to_subdofhandler .|> (i -> convert(IT,i)) |> cu
+    subdofhandlers = dh.subdofhandlers .|> (sdh -> _adapt(strategy, sdh)) |> cu |> cudaconvert
gpudata = DeviceDofHandlerData(
grid,
subdofhandlers,
# field_names,
cell_dofs,
cell_dofs_offset,
cell_to_sdh,
-        convert(Int32,dh.ndofs),
+        convert(IT,dh.ndofs),
)
#return DeviceDofHandler(dh, gpudata)
return DeviceDofHandler(gpudata)
end

-_symbols_to_int32(symbols) = 1:length(symbols) .|> (sym -> convert(Int32, sym))
+_symbols_to_int(symbols,IT::Type) = 1:length(symbols) .|> (sym -> convert(IT, sym))


-function Adapt.adapt_structure(to, sdh::SubDofHandler)
-    cellset = Adapt.adapt_structure(to, sdh.cellset |> collect .|> (x -> convert(Int32, x)) |> cu)
-    field_names = Adapt.adapt_structure(to, _symbols_to_int32(sdh.field_names) |> cu)
-    field_interpolations = sdh.field_interpolations .|> (ip -> Adapt.adapt_structure(to, ip)) |> cu
-    ndofs_per_cell = Adapt.adapt_structure(to, sdh.ndofs_per_cell)
+function _adapt(strategy::CudaAssemblyStrategy, sdh::SubDofHandler)
+    IT = inttype(strategy)
+    cellset = sdh.cellset |> collect .|> (x -> convert(IT, x)) |> cu |> cudaconvert
+    field_names = _symbols_to_int(sdh.field_names,IT) |> cu |> cudaconvert
+    field_interpolations = sdh.field_interpolations |> convert_vec_to_concrete |> cu |> cudaconvert
+    ndofs_per_cell = sdh.ndofs_per_cell
return DeviceSubDofHandlerData(cellset, field_names, field_interpolations, ndofs_per_cell)
end

-function Adapt.adapt_structure(to, grid::Grid{sdim, cell_type, T}) where {sdim, cell_type, T}
+function _adapt(strategy::CudaAssemblyStrategy, grid::Grid{sdim, cell_type, T}) where {sdim, cell_type, T}
node_type = typeof(first(grid.nodes))
-    cells = Adapt.adapt_structure(to, grid.cells .|> (x -> Int32.(x.nodes)) .|> eltype(grid.cells) |> cu)
-    nodes = Adapt.adapt_structure(to, grid.nodes |> cu)
+    cells = grid.cells |> convert_vec_to_concrete |> cu
+    nodes = grid.nodes |> cu
#TODO subdomain info
return DeviceGrid{sdim, cell_type, T, typeof(cells), typeof(nodes)}(cells, nodes)
end
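For context, a minimal sketch of how the new strategy-driven entry point is meant to be called. The grid and field setup below are hypothetical; `CudaAssemblyStrategy` and the `Adapt.adapt_structure(strategy, dh)` method are the ones introduced in this commit, and the CUDA extension must be loaded for them to be available:

    using Ferrite, CUDA, Adapt
    using Thunderbolt

    # Hypothetical setup: any closed DofHandler will do.
    grid = generate_grid(Quadrilateral, (8, 8))
    dh = DofHandler(grid)
    add!(dh, :u, Lagrange{RefQuadrilateral, 1}())
    close!(dh)

    # The strategy carries the GPU number types; all index vectors are
    # narrowed to inttype(strategy) during adaption.
    strategy = Thunderbolt.CudaAssemblyStrategy(Float32, Int32)
    device_dh = Adapt.adapt_structure(strategy, dh)  # -> DeviceDofHandler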
10 changes: 6 additions & 4 deletions ext/cuda/cuda_iterator.jl
@@ -12,7 +12,7 @@ function _cell_iterator(dh::DeviceDofHandlerData,sdh_idx::Integer,n_cells::Integ
global_thread_id = (blockIdx().x - Int32(1)) * bd + local_thread_id
global_thread_id <= n_cells || return DeviceOutOfBoundCellIterator()
cell_mem = NoCellMem()
-    return DeviceCellIterator(dh, grid, n_cells, cell_mem ,sdh_idx)
+    return DeviceCellIterator(dh, grid, n_cells, cell_mem ,sdh_idx)
end

# iterator with global memory allocation
@@ -89,18 +89,20 @@ Base.iterate(::DeviceOutOfBoundCellIterator, state) = nothing # I believe this i

# cuda cell cache #
function _makecache(iterator::AbstractDeviceCellIterator, e::Ti) where {Ti <: Integer}
+    # NOTE: e is the global thread index. When iterating over all cells in the grid, e is the cell index,
+    # whereas when iterating over a subdomain, e is just the iterator index in 1:n_cells of that subdomain.
dh = iterator.dh
grid = iterator.grid
sdh_idx = iterator.sdh_idx
+    cellid = e
+    sdh_idx <= 0 || (cellid = dh.subdofhandlers[sdh_idx].cellset[e])
-    cell = Ferrite.getcells(grid, e)
+    cell = Ferrite.getcells(grid, cellid)

-    # Extract the node IDs of the cell.
+    #Extract the node IDs of the cell.
nodes = SVector(convert.(Ti, Ferrite.get_node_ids(cell))...)

# Extract the degrees of freedom for the cell.
-    dofs = Ferrite.celldofs(dh, e)
+    dofs = Ferrite.celldofs(dh, cellid)

# Get the coordinates of the nodes of the cell.
CT = Ferrite.get_coordinate_type(grid)
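The cellid lookup above is the heart of this fix. A plain-Julia sketch of the index mapping, runnable on the CPU, with a made-up cellset:

    cellset = Int32[3, 7, 9]   # hypothetical flattened cellset of one subdomain
    e = Int32(2)               # thread index = iterator index into the subdomain
    sdh_idx = 1                # iterating a subdomain, not the whole grid

    cellid = e                               # whole-grid case: e is already the cell id
    sdh_idx <= 0 || (cellid = cellset[e])    # subdomain case: translate to the global cell id
    @assert cellid == Int32(7)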
2 changes: 1 addition & 1 deletion ext/cuda/cuda_operator.jl
@@ -40,7 +40,7 @@ function _init_linop_cuda(linop::LinearOperator,strategy::CudaAssemblyStrategy)
blocks = _calculate_nblocks(threads, n_cells)
n_basefuncs = convert(IT,ndofs_per_cell(dh))
eles_caches = _setup_caches(linop)
-    device_dh = adapt_structure(CUDA.KernelAdaptor(), dh)
+    device_dh = Adapt.adapt_structure(strategy, dh)
mem_alloc = try_allocate_shared_mem(FeMemShape{FT}, threads, n_basefuncs)
mem_alloc isa Nothing || return CudaOperatorKernel(linop, threads, blocks, mem_alloc,eles_caches,device_dh)

4 changes: 2 additions & 2 deletions src/Thunderbolt.jl
@@ -46,6 +46,7 @@ import GPUArraysCore: AbstractGPUVector, AbstractGPUArray
import Adapt:
Adapt, adapt_structure, adapt


include("utils.jl")

include("mesh/meshes.jl")
@@ -91,8 +91,7 @@ include("modeling/rsafdq2022.jl")
include("discretization/rsafdq-operator.jl")


## GPU stuf ##
include("gpu/assemble_strategy.jl")
include("gpu/gpu_utils.jl")
include("gpu/gpu_operator.jl")


1 change: 1 addition & 0 deletions src/ferrite-addons/gpu/adapt.jl
@@ -17,3 +17,4 @@ function Adapt.adapt_structure(to, cv::CellValues)
return StaticCellValues(fv, gm, weights)
end


53 changes: 3 additions & 50 deletions src/ferrite-addons/gpu/device_dofhandler.jl
@@ -1,74 +1,27 @@

-# Utility which holds partial information for assembly.
+# # Utility which holds partial information for assembly.
struct DeviceSubDofHandlerData{VEC_IP,IndexType, IndexVectorType <: AbstractVector{IndexType},Ti<:Integer} <: Ferrite.AbstractDofHandler
# Relevant fields from GPUDofHandler
#cell_dofs::IndexVectorType # why we need this?
#cell_dofs_offset::IndexVectorType # why we need this?
# Flattened cellset
cellset::IndexVectorType
field_names::IndexVectorType
-    field_interpolations::VEC_IP
+    field_interpolations::VEC_IP
ndofs_per_cell::Ti
end


# Utility which holds partial information for assembly.
struct DeviceDofHandlerData{sdim, G<:Ferrite.AbstractGrid{sdim}, #=nfields,=# SDHTupleType, IndexType, IndexVectorType <: AbstractVector{IndexType},Ti<: Integer} <: Ferrite.AbstractDofHandler
+    grid::G
subdofhandlers::SDHTupleType
# field_names::SVector{Symbol, nfields}
cell_dofs::IndexVectorType
cell_dofs_offset::IndexVectorType
cell_to_subdofhandler::IndexVectorType
-    # grid::G # We do not need this explicitly on the GPU
ndofs::Ti
end

function Base.show(io::IO, mime::MIME"text/plain", data::DeviceDofHandlerData{sdim}) where sdim
_show(io, mime, data, 0)
end

function _show(io::IO, mime::MIME"text/plain", data::DeviceDofHandlerData{sdim}, indent::Int) where sdim
offset = " "^indent
println(io, offset, "DeviceDofHandlerData{sdim=", sdim, "}")
_show(io, mime, data.grid, indent+2)
println(io, offset, " SubDofHandlers: ", length(data.subdofhandlers))
end

-# struct GPUDofHandler{DHType <: Ferrite.AbstractDofHandler, GPUDataType} <: Ferrite.AbstractDofHandler
-#     #dh::DHType #Why do we need this? already all info is in gpudata
-#     gpudata::GPUDataType
-# end
struct DeviceDofHandler{ GPUDataType} <: Ferrite.AbstractDofHandler
#dh::DHType #Why do we need this? already all info is in gpudata
gpudata::GPUDataType
end

function Base.show(io::IO, mime::MIME"text/plain", dh_::DeviceDofHandler)
dh = dh_.dh
println(io, "GPUDofHandler")
if length(dh.subdofhandlers) == 1
Ferrite._print_field_information(io, mime, dh.subdofhandlers[1])
else
println(io, " Fields:")
for fieldname in getfieldnames(dh)
ip = getfieldinterpolation(dh, find_field(dh, fieldname))
if ip isa ScalarInterpolation
field_type = "scalar"
elseif ip isa VectorInterpolation
_getvdim(::VectorInterpolation{vdim}) where vdim = vdim
field_type = "Vec{$(_getvdim(ip))}"
end
println(io, " ", repr(fieldname), ", ", field_type)
end
end
if !Ferrite.isclosed(dh)
println(io, " Not closed!")
else
println(io, " Total dofs: ", ndofs(dh))
end
_show(io, mime, dh_.gpudata, 2)
end

Ferrite.isclosed(::DeviceDofHandler) = true

Ferrite.allocate_matrix(dh::DeviceDofHandler) = _allocate_matrix(dh, allocate_matrix(dh.dh), dh.gpudata.cellset)
Expand Down
23 changes: 0 additions & 23 deletions src/ferrite-addons/gpu/device_grid.jl
@@ -12,29 +12,6 @@ function DeviceGrid(
return DeviceGrid{dim,C,T, CellDataType, NodeDataType}(cells, nodes)
end

function Base.show(io::IO, mime::MIME"text/plain", data::DeviceGrid)
_show(io, mime, data, 0)
end

function _show(io::IO, mime::MIME"text/plain", grid::DeviceGrid{sdim, C, T}, indent) where {sdim, C, T}
offset = " "^indent
print(io, offset, "DeviceGrid{sdim=", sdim, ", T=", T, "}")
if isconcretetype(eltype(grid.cells))
typestrs = [repr(eltype(grid.cells))]
else
typestrs = sort!(repr.(OrderedSet(typeof(x) for x in grid.cells)))
end
print(io, " with ")
join(io, typestrs, '/')
println(io, " cells and $(length(grid.nodes)) nodes")
end

# commented out because SimpleMesh is not defined in FerriteUtils module.
# function Adapt.adapt_structure(to, grid::SimpleMesh)
# return adapt_structure(to, grid.grid)
# end


Ferrite.get_coordinate_type(::DeviceGrid{sdim, <:Any, T,<:Any,<:Any}) where {sdim, T} = Vec{sdim, T} # Node is baked into the mesh type.

@inline Ferrite.getcells(grid::DeviceGrid, v::Ti) where {Ti <: Integer} = grid.cells[v]
13 changes: 0 additions & 13 deletions src/gpu/assemble_strategy.jl

This file was deleted.

41 changes: 41 additions & 0 deletions src/gpu/gpu_utils.jl
@@ -0,0 +1,41 @@
# In subdofhandlers and in the grid there are vectors that hold objects of different concrete types
# which only share an abstract supertype. One way to handle this (not optimal memory-wise) is to
# convert the vector to a vector of a Union type when it contains multiple concrete types, and to a
# vector of the single concrete type otherwise.
function convert_vec_to_concrete(vec::Vector)
# Get all unique concrete types in the vector
Ts = unique(typeof.(vec))

if length(Ts) == 1
# All elements are the same concrete type
T = Ts[1]
return collect(T, vec)
else
# Create a union of all observed concrete types
U = Union{Ts...}
return collect(U, vec)
end
end
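A quick illustration of what this returns (plain numbers stand in for cell or interpolation objects; this demo is not part of the commit):

    v1 = convert_vec_to_concrete(Any[1, 2, 3])
    @assert eltype(v1) === Int          # one concrete type: narrows to Vector{Int}

    v2 = convert_vec_to_concrete(Any[1, 2.0, 3])
    @assert eltype(v2) === Union{Float64, Int}   # mixed types: small Union element type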


#######################
## Assembly Strategy ##
#######################
abstract type AbstractAssemblyStrategy end

# Encompasses all the data types that need to be worked with on the GPU.
struct CudaAssemblyStrategy <: AbstractAssemblyStrategy
floattype::Type
inttype::Type
end

floattype(strategy::CudaAssemblyStrategy) = strategy.floattype
inttype(strategy::CudaAssemblyStrategy) = strategy.inttype

CudaDefaultAssemblyStrategy() = CudaAssemblyStrategy(Float32, Int32)



function Adapt.adapt_structure(::AbstractAssemblyStrategy, dh::DofHandler)
error("GPU specific implementation for `adapt_structure(to,dh::DofHandler)` is not implemented yet")
end
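Usage of the strategy type is straightforward; a short sketch using the default constructor and accessors defined just above:

    strategy = CudaDefaultAssemblyStrategy()   # same as CudaAssemblyStrategy(Float32, Int32)
    floattype(strategy)                        # -> Float32
    inttype(strategy)                          # -> Int32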
