Skip to content

Commit 0dbaf72

Browse files
S-D-Rmaleadt
authored andcommitted
Add support for constant memory.
1 parent d4576f1 commit 0dbaf72

File tree

7 files changed

+507
-2
lines changed

7 files changed

+507
-2
lines changed

lib/cudadrv/module/global.jl

+33-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
# - should be more dict-like: get and setindex(::name), haskey(::name)
55
# - globals(::Type)?
66

7-
export
8-
CuGlobal, get, set
7+
export CuGlobal, get, set, CuGlobalArray
98

109

1110
"""
@@ -62,3 +61,35 @@ function Base.setindex!(var::CuGlobal{T}, val::T) where T
6261
val_ref = Ref{T}(val)
6362
cuMemcpyHtoD_v2(var, val_ref, var.buf.bytesize)
6463
end
64+
65+
"""
66+
CuGlobalArray{T}(mod::CuModule, name::String, len::Integer)
67+
68+
Acquires a global array variable handle from a named global in a module.
69+
"""
70+
struct CuGlobalArray{T} # TODO: the functionality provided by this struct can most likely be merged into CuGlobal{T}
    # device buffer wrapping the pointer and byte size reported by `cuModuleGetGlobal_v2`
    buf::Mem.DeviceBuffer

    # Look up the global called `name` in `mod` and wrap it as `len` elements of type `T`.
    # Throws an `ArgumentError` when the byte size reported for the global differs from
    # `sizeof(T) * len`.
    function CuGlobalArray{T}(mod::CuModule, name::String, len::Integer) where T
        ptr_ref = Ref{CuPtr{Cvoid}}()
        nbytes_ref = Ref{Csize_t}()
        cuModuleGetGlobal_v2(ptr_ref, nbytes_ref, mod, name)
        if nbytes_ref[] != (sizeof(T) * len)
            # interpolate the requested element count `len` (not the `length` function)
            throw(ArgumentError("size of global array '$name' ($(nbytes_ref[])) does not match given size (sizeof($T) * $len)"))
        end
        buf = Mem.DeviceBuffer(ptr_ref[], nbytes_ref[])

        return new{T}(buf)
    end
end
85+
86+
# The element type of a global array is its type parameter.
function Base.eltype(::Type{CuGlobalArray{T}}) where {T}
    return T
end
87+
88+
# Size of a global array is whatever `sizeof` reports for its `buf` field.
function Base.sizeof(global_array::CuGlobalArray{T}) where {T}
    return sizeof(global_array.buf)
end
89+
90+
# Copy the host array `src` into `global_array`. Both must have the same `sizeof`,
# otherwise a `DimensionMismatch` is thrown before anything is copied.
function Base.copyto!(global_array::CuGlobalArray{T}, src::Array{T}) where {T}
    sizeof(src) == sizeof(global_array) ||
        throw(DimensionMismatch("size of `src` ($(sizeof(src))) does not match global array ($(sizeof(global_array)))"))
    cuMemcpyHtoD_v2(global_array.buf, src, sizeof(src))
end

src/CUDA.jl

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ include("array.jl")
6060
include("gpuarrays.jl")
6161
include("utilities.jl")
6262
include("texture.jl")
63+
include("memory_constant.jl")
6364

6465
# array libraries
6566
include("../lib/complex.jl")

src/compiler/gpucompiler.jl

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ function GPUCompiler.finish_module!(job::CUDACompilerJob, mod::LLVM.Module)
3535
Tuple{CompilerJob{PTXCompilerTarget}, typeof(mod)},
3636
job, mod)
3737
emit_exception_flag!(mod)
38+
emit_constant_memory_initializer!(mod)
3839
end
3940

4041
function GPUCompiler.link_libraries!(job::CUDACompilerJob, mod::LLVM.Module,

src/device/intrinsics.jl

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ include("intrinsics/memory_dynamic.jl")
1212
include("intrinsics/atomics.jl")
1313
include("intrinsics/misc.jl")
1414
include("intrinsics/wmma.jl")
15+
include("intrinsics/memory_constant.jl")
1516

1617
# functionality from libdevice
1718
#
+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Constant Memory
2+
3+
export CuDeviceConstantMemory
4+
5+
"""
6+
CuDeviceConstantMemory{T,N,Name,Shape}
7+
8+
The device-side counterpart of [`CuConstantMemory{T,N}`](@ref). This type should not be used
9+
directly except in the case of `CuConstantMemory` global variables, where it denotes the
10+
type of the relevant kernel argument.
11+
12+
Note that the `Name` and `Shape` type variables are implementation details and it is
13+
discouraged to use them directly. Instead use [name(::CuConstantMemory)](@ref) and
14+
[Base.size(::CuConstantMemory)](@ref) respectively.
15+
"""
16+
struct CuDeviceConstantMemory{T,N,Name,Shape} <: AbstractArray{T,N} end # field-less: name and shape are carried as type parameters
17+
18+
"""
19+
Get the name of the underlying global variable of this `CuDeviceConstantMemory`.
20+
"""
21+
function name(::CuDeviceConstantMemory{T,N,Name,Shape}) where {T,N,Name,Shape}
    # the name is stored purely as the third type parameter
    return Name
end
22+
23+
# Two device constant memory objects compare equal exactly when their names are equal.
function Base.:(==)(A::CuDeviceConstantMemory, B::CuDeviceConstantMemory)
    return name(A) == name(B)
end
24+
# Hash on the name only, mixing it into the incoming hash state `h`.
function Base.hash(A::CuDeviceConstantMemory, h::UInt)
    return hash(name(A), h)
end
25+
26+
# The size is the `Shape` type parameter itself.
function Base.size(::CuDeviceConstantMemory{T,N,Name,Shape}) where {T,N,Name,Shape}
    return Shape
end
27+
28+
# Linear indexing forwards to `constmemref`, propagating the caller's inbounds context.
Base.@propagate_inbounds function Base.getindex(A::CuDeviceConstantMemory, i::Integer)
    return constmemref(A, i)
end
29+
30+
# Device constant memory is indexed linearly.
function Base.IndexStyle(::Type{<:CuDeviceConstantMemory})
    return Base.IndexLinear()
end
31+
32+
# Bounds-check `index` against `A`, then read the element from the global named `Name`.
# The element count is lifted into a `Val` so the generated reader can size the global.
@inline function constmemref(A::CuDeviceConstantMemory{T,N,Name,Shape}, index::Integer) where {T,N,Name,Shape}
    @boundscheck checkbounds(A, index)
    return read_constant_mem(Val(Name), index, T, Val(length(A)))
end
37+
38+
# Generated reader: builds an LLVM function that loads one element of type `T` from a
# `len`-element global array named `global_name` in the constant address space, and calls
# it with `index` shifted down by one.
@generated function read_constant_mem(::Val{global_name}, index::Integer, ::Type{T}, ::Val{len}) where {global_name,T,len}
    JuliaContext() do ctx
        # LLVM types of the index argument and of the loaded element
        index_type = convert(LLVMType, Int, ctx)
        element_type = convert(LLVMType, T, ctx)

        # create the wrapper function and fetch the module it lives in
        llvm_f, _ = create_function(element_type, [index_type])
        mod = LLVM.parent(llvm_f)

        # declare the backing array as a global variable in constant memory
        array_type = LLVM.ArrayType(element_type, len)
        global_var = GlobalVariable(mod, array_type, string(global_name), AS.Constant)
        linkage!(global_var, LLVM.API.LLVMExternalLinkage) # NOTE: external linkage is the default
        extinit!(global_var, true)
        # TODO: global_var alignment?

        # emit the body: compute the element address, load it, tag it, return it
        Builder(ctx) do builder
            entry = BasicBlock(llvm_f, "entry", ctx)
            position!(builder, entry)

            gep_indices = [ConstantInt(0, ctx), parameters(llvm_f)[1]]
            element_ptr = inbounds_gep!(builder, global_var, gep_indices)
            loaded = load!(builder, element_ptr)

            metadata(loaded)[LLVM.MD_tbaa] = tbaa_addrspace(AS.Constant, ctx)

            ret!(builder, loaded)
        end

        # call the wrapper, passing `index - 1` converted to `Int`
        call_function(llvm_f, T, Tuple{Int}, :((Int(index - one(index))),))
    end
end

src/memory_constant.jl

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
export CuConstantMemory
2+
3+
# Map a constant memory name to its array value
4+
const constant_memory_initializer = Dict{Symbol,WeakRef}()
5+
6+
"""
7+
CuConstantMemory{T,N}(value::Array{T,N})
8+
CuConstantMemory{T}(::UndefInitializer, dims::Integer...)
9+
CuConstantMemory{T}(::UndefInitializer, dims::Dims{N})
10+
11+
Construct an `N`-dimensional constant memory array of type `T`, where `isbits(T)`.
12+
13+
Note that `deepcopy` will be called on the `value` constructor argument, meaning that
14+
mutations to the original `value` or its elements after construction will not be
15+
reflected in the value of `CuConstantMemory`.
16+
17+
The `UndefInitializer` constructors behave exactly like the regular `Array` version,
18+
i.e. the value of `CuConstantMemory` will be completely random when using them.
19+
20+
Unlike in CUDA C, structs cannot be put directly into constant memory. This feature can
21+
be emulated however by wrapping the struct inside of a 1-element array.
22+
23+
When using `CuConstantMemory` as a global variable it is required to pass it as an argument
24+
to a kernel, where the argument is of type [`CuDeviceConstantMemory{T,N}`](@ref).
25+
When using `CuConstantMemory` as a local variable that is captured by a kernel closure
26+
this is not required, and it can be used directly like any other captured variable
27+
without passing it as an argument.
28+
29+
In cases where the same kernel object gets called multiple times, and it is desired to mutate
30+
the value of a `CuConstantMemory` variable in this kernel between calls, please refer
31+
to [`Base.copyto!(const_mem::CuConstantMemory{T}, value::Array{T}, kernel::HostKernel)`](@ref)
32+
"""
33+
struct CuConstantMemory{T,N} <: AbstractArray{T,N}
    name::Symbol
    value::Array{T,N}

    function CuConstantMemory(value::Array{T,N}) where {T,N}
        # TODO: add finalizer that removes the relevant entry from constant_memory_initializer?
        if !Base.isbitstype(T)
            throw(ArgumentError("CuConstantMemory only supports bits types"))
        end
        # fresh gensym, passed through `GPUCompiler.safe_name`, then back to a Symbol
        unique_name = Symbol(GPUCompiler.safe_name(string(gensym("constant_memory"))))
        # private copy of the data; the registry only keeps a weak reference to it
        stored = deepcopy(value)
        constant_memory_initializer[unique_name] = WeakRef(stored)
        return new{T,N}(unique_name, stored)
    end
end
48+
49+
# Uninitialized constructor taking the dimensions as separate integer arguments.
function CuConstantMemory{T}(::UndefInitializer, dims::Integer...) where {T}
    return CuConstantMemory(Array{T}(undef, dims))
end
51+
# Uninitialized constructor taking the dimensions as a single `Dims` tuple.
function CuConstantMemory{T}(::UndefInitializer, dims::Dims{N}) where {T,N}
    return CuConstantMemory(Array{T,N}(undef, dims))
end
53+
54+
# Host-side size is the size of the wrapped `value` array.
function Base.size(A::CuConstantMemory)
    return size(A.value)
end
55+
56+
# Host-side reads go straight to the wrapped `value` array.
function Base.getindex(A::CuConstantMemory, i::Integer)
    return A.value[i]
end
57+
# Host-side writes go straight to the wrapped `value` array; the result of the inner
# `setindex!` call is returned unchanged.
function Base.setindex!(A::CuConstantMemory, v, i::Integer)
    return Base.setindex!(A.value, v, i)
end
58+
# Constant memory is indexed linearly.
function Base.IndexStyle(::Type{<:CuConstantMemory})
    return Base.IndexLinear()
end
59+
60+
# Convert to the device-side type by moving the name and the size of the host array into
# type parameters; no data is carried along.
function Adapt.adapt_storage(::Adaptor, A::CuConstantMemory{T,N}) where {T,N}
    shape = size(A.value)
    return CuDeviceConstantMemory{T,N,A.name,shape}()
end
62+
63+
64+
"""
65+
Given a `kernel` returned by `@cuda`, copy `value` into `const_mem` for subsequent calls to this `kernel`.
66+
If `const_mem` is not used within `kernel`, an error will be thrown.
67+
"""
68+
function Base.copyto!(const_mem::CuConstantMemory{T}, value::Array{T}, kernel::HostKernel) where T
    # TODO: add bool argument to also change the value field of const_mem?
    # reject shape mismatches up front
    size(const_mem) == size(value) ||
        throw(DimensionMismatch("size of `value` does not match size of constant memory"))

    # look up the global named after `const_mem` in the kernel's module and overwrite it
    device_global = CuGlobalArray{T}(kernel.mod, string(const_mem.name), length(const_mem))
    copyto!(device_global, value)
end
77+
78+
79+
"""
    emit_constant_memory_initializer!(mod::LLVM.Module)

Give every user-defined constant memory global in `mod` an initializer built from the host
array registered under the same name in `constant_memory_initializer`.

Globals outside the constant address space, and constant-space globals whose name is not
registered, are left untouched. Depending on the LLVM element type of the global, the
initializer is built as:

- integer or floating-point: a `ConstantArray` of the flattened host values;
- LLVM array: one `ConstantArray` of field values per host element;
- LLVM struct: one `ConstantStruct` per host element, supporting `Bool` (emitted as `i8`),
  `Integer` and `AbstractFloat` fields.

Throws an `AssertionError` when the registered host array has been garbage collected, and
an `ErrorException` when a struct field or element type cannot be converted.
"""
function emit_constant_memory_initializer!(mod::LLVM.Module)
    for global_var in globals(mod)
        T_global = llvmtype(global_var)
        addrspace(T_global) == AS.Constant || continue

        constant_memory_name = Symbol(LLVM.name(global_var))
        # non user defined constant memory, most likely from the CUDA runtime
        haskey(constant_memory_initializer, constant_memory_name) || continue

        arr = constant_memory_initializer[constant_memory_name].value
        # explicit throw instead of `@assert`: this is a runtime condition, and the check
        # must not disappear if assertions are disabled
        isnothing(arr) &&
            throw(AssertionError("calling kernel containing garbage collected constant memory"))

        # `vec` always yields a vector, also for 1-element arrays, where
        # `reduce(vcat, arr)` would hand back the bare element instead
        flattened_arr = vec(arr)
        ctx = LLVM.context(mod)
        typ = eltype(eltype(T_global))

        # TODO: have a look at how julia converts structs to llvm:
        #       https://github.com/JuliaLang/julia/blob/80ace52b03d9476f3d3e6ff6da42f04a8df1cf7b/src/cgutils.cpp#L572
        #       this only seems to emit a type though
        if isa(typ, LLVM.IntegerType) || isa(typ, LLVM.FloatingPointType)
            init = ConstantArray(flattened_arr, ctx)
        elseif isa(typ, LLVM.ArrayType) # a struct with every field of the same type gets optimized to an array
            constant_arrays = LLVM.Constant[]
            for x in flattened_arr
                fields = collect(map(name->getfield(x, name), fieldnames(typeof(x))))
                constant_array = ConstantArray(fields, ctx)
                push!(constant_arrays, constant_array)
            end
            init = ConstantArray(typ, constant_arrays)
        elseif isa(typ, LLVM.StructType)
            constant_structs = LLVM.Constant[]
            for x in flattened_arr
                constants = LLVM.Constant[]
                for fieldname in fieldnames(typeof(x))
                    field = getfield(x, fieldname)
                    if isa(field, Bool)
                        # NOTE: Bools get compiled to i8 instead of the more "correct" type i1
                        push!(constants, ConstantInt(LLVM.Int8Type(ctx), field))
                    elseif isa(field, Integer)
                        push!(constants, ConstantInt(field, ctx))
                    elseif isa(field, AbstractFloat)
                        push!(constants, ConstantFP(field, ctx))
                    else
                        # `error` already throws; no outer `throw` needed
                        error("constant memory does not currently support structs with non-primitive fields ($(typeof(x)).$fieldname::$(typeof(field)))")
                    end
                end
                const_struct = ConstantStruct(typ, constants)
                push!(constant_structs, const_struct)
            end
            init = ConstantArray(typ, constant_structs)
        else
            # unreachable, but let's be safe and throw a nice error message just in case
            error("could not emit initializer for constant memory of type $typ")
        end

        initializer!(global_var, init)
    end
end

0 commit comments

Comments
 (0)