
Commit

init fix dh adapt error
Abdelrahman912 committed Feb 5, 2025
1 parent d4d3bab commit 23f5413
Showing 13 changed files with 305 additions and 291 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -60,4 +60,4 @@ Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
-test = ["Aqua", "DelimitedFiles", "Pkg", "StaticArrays", "Tensors", "Test"]
+test = ["Aqua", "DelimitedFiles", "Pkg", "StaticArrays", "Tensors", "Test"]
5 changes: 3 additions & 2 deletions ext/CuThunderboltExt.jl
@@ -5,7 +5,7 @@ using Thunderbolt
import CUDA:
CUDA, CuArray, CuVector, CUSPARSE,blockDim,blockIdx,gridDim,threadIdx,
threadIdx, blockIdx, blockDim, @cuda, @cushow,
-    CUDABackend, launch_configuration, device, cu
+    CUDABackend, launch_configuration, device, cu,cudaconvert

import Thunderbolt:
UnPack.@unpack,
@@ -15,7 +15,8 @@ import Thunderbolt:
AbstractPointwiseSolverCache,assemble_element!,
LinearOperator,AbstractOperatorKernel,QuadratureRuleCollection,
AnalyticalCoefficientElementCache,AnalyticalCoefficientCache,CartesianCoordinateSystemCache,
-    setup_element_cache,update_operator!,init_linear_operator,FieldCoefficientCache, CudaAssemblyStrategy, floattype,inttype
+    setup_element_cache,update_operator!,init_linear_operator,FieldCoefficientCache, CudaAssemblyStrategy, floattype,inttype,
+    convert_vec_to_concrete

import Thunderbolt.FerriteUtils:
StaticInterpolationValues,StaticCellValues, try_allocate_shared_mem,
42 changes: 20 additions & 22 deletions ext/cuda/cuda_adapt.jl
@@ -21,43 +21,41 @@ function _convert_subdofhandler_to_gpu(cell_dofs, cell_dof_soffset, sdh::SubDofH
)
end

-function Adapt.adapt_structure(to, dh::DofHandler{sdim}) where sdim
-    grid = adapt_structure(to, dh.grid)
-    # field_names = Tuple(sym for sym in dh.field_names)
-    #IndexType = eltype(dh.cell_dofs)
-    #IndexVectorType = CuVector{IndexType}
-    cell_dofs = adapt(to, dh.cell_dofs .|> (i -> convert(Int32,i)) |> cu) # currently you cant create Dofhandler with Int32
-    cell_dofs_offset = adapt(to, dh.cell_dofs_offset .|> (i -> convert(Int32,i)) |> cu)
-    cell_to_sdh = adapt(to, dh.cell_to_subdofhandler .|> (i -> convert(Int32,i)) |> cu)
-    #subdofhandlers = Tuple(i->_convert_subdofhandler_to_gpu(cell_dofs, cell_dofs_offset, sdh) for sdh in dh.subdofhandlers)
-    subdofhandlers = adapt_structure(to,dh.subdofhandlers .|> (sdh -> Adapt.adapt_structure(to, sdh)) |> cu)

+function Adapt.adapt_structure(strategy::CudaAssemblyStrategy, dh::DofHandler)
+    IT = inttype(strategy)
+    grid = _adapt(strategy, dh.grid)
+    cell_dofs = dh.cell_dofs .|> (i -> convert(IT,i)) |> cu
+    cell_dofs_offset = dh.cell_dofs_offset .|> (i -> convert(IT,i)) |> cu
+    cell_to_sdh = dh.cell_to_subdofhandler .|> (i -> convert(IT,i)) |> cu
+    subdofhandlers = dh.subdofhandlers .|> (sdh -> _adapt(strategy, sdh)) |> cu |> cudaconvert
gpudata = DeviceDofHandlerData(
grid,
subdofhandlers,
# field_names,
cell_dofs,
cell_dofs_offset,
cell_to_sdh,
-        convert(Int32,dh.ndofs),
+        convert(IT,dh.ndofs),
)
#return DeviceDofHandler(dh, gpudata)
return DeviceDofHandler(gpudata)
end

-_symbols_to_int32(symbols) = 1:length(symbols) .|> (sym -> convert(Int32, sym))
+_symbols_to_int(symbols,IT::Type) = 1:length(symbols) .|> (sym -> convert(IT, sym))


-function Adapt.adapt_structure(to, sdh::SubDofHandler)
-    cellset = Adapt.adapt_structure(to, sdh.cellset |> collect .|> (x -> convert(Int32, x)) |> cu)
-    field_names = Adapt.adapt_structure(to, _symbols_to_int32(sdh.field_names) |> cu)
-    field_interpolations = sdh.field_interpolations .|> (ip -> Adapt.adapt_structure(to, ip)) |> cu
-    ndofs_per_cell = Adapt.adapt_structure(to, sdh.ndofs_per_cell)
+function _adapt(strategy::CudaAssemblyStrategy, sdh::SubDofHandler)
+    IT = inttype(strategy)
+    cellset = sdh.cellset |> collect .|> (x -> convert(IT, x)) |> cu |> cudaconvert
+    field_names = _symbols_to_int(sdh.field_names,IT) |> cu |> cudaconvert
+    field_interpolations = sdh.field_interpolations |> convert_vec_to_concrete |> cu |> cudaconvert
+    ndofs_per_cell = sdh.ndofs_per_cell
return DeviceSubDofHandlerData(cellset, field_names, field_interpolations, ndofs_per_cell)
end

-function Adapt.adapt_structure(to, grid::Grid{sdim, cell_type, T}) where {sdim, cell_type, T}
+function _adapt(strategy::CudaAssemblyStrategy, grid::Grid{sdim, cell_type, T}) where {sdim, cell_type, T}
node_type = typeof(first(grid.nodes))
-    cells = Adapt.adapt_structure(to, grid.cells .|> (x -> Int32.(x.nodes)) .|> eltype(grid.cells) |> cu)
-    nodes = Adapt.adapt_structure(to, grid.nodes |> cu)
+    cells = grid.cells |> convert_vec_to_concrete |> cu
+    nodes = grid.nodes |> cu
#TODO subdomain info
return DeviceGrid{sdim, cell_type, T, typeof(cells), typeof(nodes)}(cells, nodes)
end
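For context, a minimal sketch of how the new strategy-driven entry point is meant to be called. The grid and field setup below are hypothetical; `CudaAssemblyStrategy` and the `Adapt.adapt_structure(strategy, dh)` method are the ones introduced in this commit, and the CUDA extension must be loaded for them to be available:

    using Ferrite, CUDA, Adapt
    using Thunderbolt

    # Hypothetical setup: any closed DofHandler will do.
    grid = generate_grid(Quadrilateral, (8, 8))
    dh = DofHandler(grid)
    add!(dh, :u, Lagrange{RefQuadrilateral, 1}())
    close!(dh)

    # The strategy carries the GPU number types; all index vectors are
    # narrowed to inttype(strategy) during adaption.
    strategy = Thunderbolt.CudaAssemblyStrategy(Float32, Int32)
    device_dh = Adapt.adapt_structure(strategy, dh)  # -> DeviceDofHandler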
10 changes: 6 additions & 4 deletions ext/cuda/cuda_iterator.jl
@@ -12,7 +12,7 @@ function _cell_iterator(dh::DeviceDofHandlerData,sdh_idx::Integer,n_cells::Integ
global_thread_id = (blockIdx().x - Int32(1)) * bd + local_thread_id
global_thread_id <= n_cells || return DeviceOutOfBoundCellIterator()
cell_mem = NoCellMem()
-    return DeviceCellIterator(dh, grid, n_cells, cell_mem ,sdh_idx)
+    return DeviceCellIterator(dh, grid, n_cells, cell_mem ,sdh_idx)
end

# iterator with global memory allocation
@@ -89,18 +89,20 @@ Base.iterate(::DeviceOutOfBoundCellIterator, state) = nothing # I believe this i

# cuda cell cache #
function _makecache(iterator::AbstractDeviceCellIterator, e::Ti) where {Ti <: Integer}
+    # NOTE: e is the global thread index. When iterating over all cells in the grid, e is the cell index,
+    # whereas when iterating over a subdomain, e is just the iterator index in 1:n_cells of that subdomain.
dh = iterator.dh
grid = iterator.grid
sdh_idx = iterator.sdh_idx
+    cellid = e
+    sdh_idx <= 0 || (cellid = dh.subdofhandlers[sdh_idx].cellset[e])
-    cell = Ferrite.getcells(grid, e)
+    cell = Ferrite.getcells(grid, cellid)

-    # Extract the node IDs of the cell.
+    #Extract the node IDs of the cell.
nodes = SVector(convert.(Ti, Ferrite.get_node_ids(cell))...)

# Extract the degrees of freedom for the cell.
-    dofs = Ferrite.celldofs(dh, e)
+    dofs = Ferrite.celldofs(dh, cellid)

# Get the coordinates of the nodes of the cell.
CT = Ferrite.get_coordinate_type(grid)
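The cellid lookup above is the heart of this fix. A plain-Julia sketch of the index mapping, runnable on the CPU, with a made-up cellset:

    cellset = Int32[3, 7, 9]   # hypothetical flattened cellset of one subdomain
    e = Int32(2)               # thread index = iterator index into the subdomain
    sdh_idx = 1                # iterating a subdomain, not the whole grid

    cellid = e                               # whole-grid case: e is already the cell id
    sdh_idx <= 0 || (cellid = cellset[e])    # subdomain case: translate to the global cell id
    @assert cellid == Int32(7)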
2 changes: 1 addition & 1 deletion ext/cuda/cuda_operator.jl
@@ -40,7 +40,7 @@ function _init_linop_cuda(linop::LinearOperator,strategy::CudaAssemblyStrategy)
blocks = _calculate_nblocks(threads, n_cells)
n_basefuncs = convert(IT,ndofs_per_cell(dh))
eles_caches = _setup_caches(linop)
-    device_dh = adapt_structure(CUDA.KernelAdaptor(), dh)
+    device_dh = Adapt.adapt_structure(strategy, dh)
mem_alloc = try_allocate_shared_mem(FeMemShape{FT}, threads, n_basefuncs)
mem_alloc isa Nothing || return CudaOperatorKernel(linop, threads, blocks, mem_alloc,eles_caches,device_dh)

4 changes: 2 additions & 2 deletions src/Thunderbolt.jl
@@ -46,6 +46,7 @@ import GPUArraysCore: AbstractGPUVector, AbstractGPUArray
import Adapt:
Adapt, adapt_structure, adapt


include("utils.jl")

include("mesh/meshes.jl")
@@ -91,8 +91,7 @@ include("modeling/rsafdq2022.jl")
include("discretization/rsafdq-operator.jl")


## GPU stuf ##
include("gpu/assemble_strategy.jl")
include("gpu/gpu_utils.jl")
include("gpu/gpu_operator.jl")


1 change: 1 addition & 0 deletions src/ferrite-addons/gpu/adapt.jl
@@ -17,3 +17,4 @@ function Adapt.adapt_structure(to, cv::CellValues)
return StaticCellValues(fv, gm, weights)
end


53 changes: 3 additions & 50 deletions src/ferrite-addons/gpu/device_dofhandler.jl
@@ -1,74 +1,27 @@

-# Utility which holds partial information for assembly.
+# # Utility which holds partial information for assembly.
struct DeviceSubDofHandlerData{VEC_IP,IndexType, IndexVectorType <: AbstractVector{IndexType},Ti<:Integer} <: Ferrite.AbstractDofHandler
# Relevant fields from GPUDofHandler
#cell_dofs::IndexVectorType # why we need this?
#cell_dofs_offset::IndexVectorType # why we need this?
# Flattened cellset
cellset::IndexVectorType
field_names::IndexVectorType
-    field_interpolations::VEC_IP
+    field_interpolations::VEC_IP
ndofs_per_cell::Ti
end


# Utility which holds partial information for assembly.
struct DeviceDofHandlerData{sdim, G<:Ferrite.AbstractGrid{sdim}, #=nfields,=# SDHTupleType, IndexType, IndexVectorType <: AbstractVector{IndexType},Ti<: Integer} <: Ferrite.AbstractDofHandler
+    grid::G
subdofhandlers::SDHTupleType
# field_names::SVector{Symbol, nfields}
cell_dofs::IndexVectorType
cell_dofs_offset::IndexVectorType
cell_to_subdofhandler::IndexVectorType
-    # grid::G # We do not need this explicitly on the GPU
ndofs::Ti
end

function Base.show(io::IO, mime::MIME"text/plain", data::DeviceDofHandlerData{sdim}) where sdim
_show(io, mime, data, 0)
end

function _show(io::IO, mime::MIME"text/plain", data::DeviceDofHandlerData{sdim}, indent::Int) where sdim
offset = " "^indent
println(io, offset, "DeviceDofHandlerData{sdim=", sdim, "}")
_show(io, mime, data.grid, indent+2)
println(io, offset, " SubDofHandlers: ", length(data.subdofhandlers))
end

-# struct GPUDofHandler{DHType <: Ferrite.AbstractDofHandler, GPUDataType} <: Ferrite.AbstractDofHandler
-#     #dh::DHType #Why do we need this? already all info is in gpudata
-#     gpudata::GPUDataType
-# end
struct DeviceDofHandler{ GPUDataType} <: Ferrite.AbstractDofHandler
#dh::DHType #Why do we need this? already all info is in gpudata
gpudata::GPUDataType
end

function Base.show(io::IO, mime::MIME"text/plain", dh_::DeviceDofHandler)
dh = dh_.dh
println(io, "GPUDofHandler")
if length(dh.subdofhandlers) == 1
Ferrite._print_field_information(io, mime, dh.subdofhandlers[1])
else
println(io, " Fields:")
for fieldname in getfieldnames(dh)
ip = getfieldinterpolation(dh, find_field(dh, fieldname))
if ip isa ScalarInterpolation
field_type = "scalar"
elseif ip isa VectorInterpolation
_getvdim(::VectorInterpolation{vdim}) where vdim = vdim
field_type = "Vec{$(_getvdim(ip))}"
end
println(io, " ", repr(fieldname), ", ", field_type)
end
end
if !Ferrite.isclosed(dh)
println(io, " Not closed!")
else
println(io, " Total dofs: ", ndofs(dh))
end
_show(io, mime, dh_.gpudata, 2)
end

Ferrite.isclosed(::DeviceDofHandler) = true

Ferrite.allocate_matrix(dh::DeviceDofHandler) = _allocate_matrix(dh, allocate_matrix(dh.dh), dh.gpudata.cellset)
Expand Down
23 changes: 0 additions & 23 deletions src/ferrite-addons/gpu/device_grid.jl
@@ -12,29 +12,6 @@ function DeviceGrid(
return DeviceGrid{dim,C,T, CellDataType, NodeDataType}(cells, nodes)
end

function Base.show(io::IO, mime::MIME"text/plain", data::DeviceGrid)
_show(io, mime, data, 0)
end

function _show(io::IO, mime::MIME"text/plain", grid::DeviceGrid{sdim, C, T}, indent) where {sdim, C, T}
offset = " "^indent
print(io, offset, "DeviceGrid{sdim=", sdim, ", T=", T, "}")
if isconcretetype(eltype(grid.cells))
typestrs = [repr(eltype(grid.cells))]
else
typestrs = sort!(repr.(OrderedSet(typeof(x) for x in grid.cells)))
end
print(io, " with ")
join(io, typestrs, '/')
println(io, " cells and $(length(grid.nodes)) nodes")
end

# commented out because SimpleMesh is not defined in FerriteUtils module.
# function Adapt.adapt_structure(to, grid::SimpleMesh)
# return adapt_structure(to, grid.grid)
# end


Ferrite.get_coordinate_type(::DeviceGrid{sdim, <:Any, T,<:Any,<:Any}) where {sdim, T} = Vec{sdim, T} # Node is baked into the mesh type.

@inline Ferrite.getcells(grid::DeviceGrid, v::Ti) where {Ti <: Integer} = grid.cells[v]
13 changes: 0 additions & 13 deletions src/gpu/assemble_strategy.jl

This file was deleted.

41 changes: 41 additions & 0 deletions src/gpu/gpu_utils.jl
@@ -0,0 +1,41 @@
# In subdofhandlers and in the grid there are vectors that hold objects of different concrete types
# which only share an abstract supertype. One way to handle this (not optimal memory-wise) is to
# convert the vector to a vector of a Union type when it contains multiple concrete types, and to a
# vector of the single concrete type otherwise.
function convert_vec_to_concrete(vec::Vector)
# Get all unique concrete types in the vector
Ts = unique(typeof.(vec))

if length(Ts) == 1
# All elements are the same concrete type
T = Ts[1]
return collect(T, vec)
else
# Create a union of all observed concrete types
U = Union{Ts...}
return collect(U, vec)
end
end
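A quick illustration of what this returns (plain numbers stand in for cell or interpolation objects; this demo is not part of the commit):

    v1 = convert_vec_to_concrete(Any[1, 2, 3])
    @assert eltype(v1) === Int          # one concrete type: narrows to Vector{Int}

    v2 = convert_vec_to_concrete(Any[1, 2.0, 3])
    @assert eltype(v2) === Union{Float64, Int}   # mixed types: small Union element type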


#######################
## Assembly Strategy ##
#######################
abstract type AbstractAssemblyStrategy end

# Encompasses all the data types that need to be worked with on the GPU.
struct CudaAssemblyStrategy <: AbstractAssemblyStrategy
floattype::Type
inttype::Type
end

floattype(strategy::CudaAssemblyStrategy) = strategy.floattype
inttype(strategy::CudaAssemblyStrategy) = strategy.inttype

CudaDefaultAssemblyStrategy() = CudaAssemblyStrategy(Float32, Int32)



function Adapt.adapt_structure(::AbstractAssemblyStrategy, dh::DofHandler)
error("GPU specific implementation for `adapt_structure(to,dh::DofHandler)` is not implemented yet")
end
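Usage of the strategy type is straightforward; a short sketch using the default constructor and accessors defined just above:

    strategy = CudaDefaultAssemblyStrategy()   # same as CudaAssemblyStrategy(Float32, Int32)
    floattype(strategy)                        # -> Float32
    inttype(strategy)                          # -> Int32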
