Skip to content

Commit fade845

Browse files
authored
Merge pull request #2030 from JuliaGPU/tb/fastmath
Add support for @cuda fastmath
2 parents ac74718 + a2d3219 commit fade845

File tree

4 files changed

+31
-4
lines changed

4 files changed

+31
-4
lines changed

Manifest.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,9 @@ version = "0.1.5"
142142

143143
[[GPUCompiler]]
144144
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
145-
git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3"
145+
git-tree-sha1 = "8de395b1243771bbb79ac832ec96c7def7a4586f"
146146
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
147-
version = "0.21.4"
147+
version = "0.22.0"
148148

149149
[[InlineStrings]]
150150
deps = ["Parsers"]

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Crayons = "4"
4646
DataFrames = "1"
4747
ExprTools = "0.1"
4848
GPUArrays = "8.6"
49-
GPUCompiler = "0.21"
49+
GPUCompiler = "0.22"
5050
KernelAbstractions = "0.9.2"
5151
LLVM = "6"
5252
Preferences = "1"

src/compiler/execution.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp
66
## high-level @cuda interface
77

88
const MACRO_KWARGS = [:dynamic, :launch]
9-
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs]
9+
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath]
1010
const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :shmem, :stream]
1111

1212

@@ -306,6 +306,7 @@ The following keyword arguments are supported:
306306
supported on LLVM 4.0+)
307307
- `name`: override the name that the kernel will have in the generated code
308308
- `always_inline`: inline all function calls in the kernel
309+
- `fastmath`: use less precise square roots and flush denormals
309310
310311
The output of this function is automatically cached, i.e. you can simply call `cufunction`
311312
in a hot path without degrading performance. New code will be generated automatically, when

test/core/codegen.jl

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,32 @@ end
157157
@test !occursin(".local", asm)
158158
end
159159

160+
@testset "fastmath" begin
161+
function div_kernel(x)
162+
i = threadIdx().x
163+
@fastmath @inbounds x[i] = 1 / x[i]
164+
return
165+
end
166+
167+
asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
168+
@test occursin("div.approx.ftz", asm)
169+
170+
# libdevice only contains fast math versions of sqrt for CUDA 11.1+
171+
if CUDA.runtime_version() >= v"11.1"
172+
function sqrt_kernel(x)
173+
i = threadIdx().x
174+
@inbounds x[i] = sqrt(x[i])
175+
return
176+
end
177+
178+
asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
179+
@test occursin("sqrt.r", asm)
180+
181+
asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
182+
@test occursin("sqrt.approx.ftz", asm)
183+
end
184+
end
185+
160186
end
161187

162188
############################################################################################

0 commit comments

Comments (0)