Support cuTENSOR contractions for 1D views #2650
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Suggested changes:

diff --git a/lib/cutensor/src/types.jl b/lib/cutensor/src/types.jl
index 68b7865d2..b014a76bd 100644
--- a/lib/cutensor/src/types.jl
+++ b/lib/cutensor/src/types.jl
@@ -202,7 +202,8 @@ mutable struct CuTensorDescriptor
handle::cutensorTensorDescriptor_t
# inner constructor handles creation and finalizer of the descriptor
function CuTensorDescriptor(sz::Vector{Int64}, st::Vector{Int64}, eltype::DataType,
- alignmentRequirement::UInt32=CUTENSOR_ALIGNMENT)
+ alignmentRequirement::UInt32 = CUTENSOR_ALIGNMENT
+ )
desc = Ref{cutensorTensorDescriptor_t}()
length(st) == (N = length(sz)) || throw(ArgumentError("size and stride vectors must have the same length"))
cutensorCreateTensorDescriptor(handle(), desc, N, sz, st, eltype, alignmentRequirement)
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 5a14f3d74..e9f8ec082 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -164,33 +164,33 @@ eltypes = [(Float32, Float32, Float32, Float32),
end
end
-# https://github.com/JuliaGPU/CUDA.jl/issues/2407
-@testset "contractions of views" begin
- @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes
- dimsA = (16,)
- dimsB = (4,)
- dimsC = (8,)
- A = rand(eltyA, dimsA)
- B = rand(eltyB, dimsB)
- C = rand(eltyC, dimsC)
- dA = CuArray(A)
- dB = CuArray(B)
- dC = CuArray(C)
- dD = CuArray(C)
- vA = @view dA[1:4]
- vB = @view dB[4:4]
- vC = @view dC[3:6]
- vD = @view dD[3:6]
- tA = CuTensor(reshape(vA, (4, 1)), [1, 2])
- tB = CuTensor(reshape(vB, (1, 1)), [3, 2])
- tC = CuTensor(reshape(vC, (1, 4)), [3, 1])
- mul!(tC, tA, tB)
- tA2 = CuTensor(copy(vA), [1, 2])
- tB2 = CuTensor(copy(vB), [3, 2])
- tD = CuTensor(copy(vD), [3, 1])
- mul!(tD, tA2, tB2)
- @test tD.data ≈ tD.data
+ # https://github.com/JuliaGPU/CUDA.jl/issues/2407
+ @testset "contractions of views" begin
+ @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes
+ dimsA = (16,)
+ dimsB = (4,)
+ dimsC = (8,)
+ A = rand(eltyA, dimsA)
+ B = rand(eltyB, dimsB)
+ C = rand(eltyC, dimsC)
+ dA = CuArray(A)
+ dB = CuArray(B)
+ dC = CuArray(C)
+ dD = CuArray(C)
+ vA = @view dA[1:4]
+ vB = @view dB[4:4]
+ vC = @view dC[3:6]
+ vD = @view dD[3:6]
+ tA = CuTensor(reshape(vA, (4, 1)), [1, 2])
+ tB = CuTensor(reshape(vB, (1, 1)), [3, 2])
+ tC = CuTensor(reshape(vC, (1, 4)), [3, 1])
+ mul!(tC, tA, tB)
+ tA2 = CuTensor(copy(vA), [1, 2])
+ tB2 = CuTensor(copy(vB), [3, 2])
+ tD = CuTensor(copy(vD), [3, 1])
+ mul!(tD, tA2, tB2)
+ @test tD.data ≈ tD.data
+ end
end
-end
 end
Codecov Report: All modified and coverable lines are covered by tests ✅
Additional details and impacted files:

@@ Coverage Diff @@
##           master    #2650   +/- ##
=======================================
  Coverage   73.50%   73.50%
=======================================
  Files         158      158
  Lines       15321    15324     +3
=======================================
+ Hits        11261    11264     +3
  Misses       4060     4060

☔ View full report in Codecov by Sentry.
CUDA.jl Benchmarks
Benchmark suite | Current: bc175aa | Previous: f62af73 | Ratio
---|---|---|---
latency/precompile | 46455682564.5 ns | 46667053506.5 ns | 1.00
latency/ttfp | 6952404648 ns | 6954689688 ns | 1.00
latency/import | 3610907756 ns | 3631856123 ns | 0.99
integration/volumerhs | 9625805.5 ns | 9624743.5 ns | 1.00
integration/byval/slices=1 | 146896 ns | 146953 ns | 1.00
integration/byval/slices=3 | 425201 ns | 425334 ns | 1.00
integration/byval/reference | 144890 ns | 145208 ns | 1.00
integration/byval/slices=2 | 286139 ns | 286016 ns | 1.00
integration/cudadevrt | 103259 ns | 103424 ns | 1.00
kernel/indexing | 14102 ns | 14214 ns | 0.99
kernel/indexing_checked | 14572 ns | 14910 ns | 0.98
kernel/occupancy | 637.8 ns | 637.5449101796407 ns | 1.00
kernel/launch | 2016.1 ns | 2102 ns | 0.96
kernel/rand | 15027 ns | 18239 ns | 0.82
array/reverse/1d | 19976 ns | 19474 ns | 1.03
array/reverse/2d | 25056 ns | 23910 ns | 1.05
array/reverse/1d_inplace | 10901.333333333334 ns | 10670 ns | 1.02
array/reverse/2d_inplace | 11355 ns | 12291 ns | 0.92
array/copy | 21126 ns | 20955 ns | 1.01
array/iteration/findall/int | 159837 ns | 155336 ns | 1.03
array/iteration/findall/bool | 140579 ns | 133979 ns | 1.05
array/iteration/findfirst/int | 155148.5 ns | 154049 ns | 1.01
array/iteration/findfirst/bool | 156103 ns | 153056 ns | 1.02
array/iteration/scalar | 71680 ns | 61530 ns | 1.16
array/iteration/logical | 217035.5 ns | 202309 ns | 1.07
array/iteration/findmin/1d | 42575 ns | 37878 ns | 1.12
array/iteration/findmin/2d | 94370.5 ns | 93537 ns | 1.01
array/reductions/reduce/1d | 45523 ns | 37060.5 ns | 1.23
array/reductions/reduce/2d | 50983.5 ns | 50765 ns | 1.00
array/reductions/mapreduce/1d | 39076.5 ns | 36727 ns | 1.06
array/reductions/mapreduce/2d | 51964 ns | 42618.5 ns | 1.22
array/broadcast | 21183 ns | 20743 ns | 1.02
array/copyto!/gpu_to_gpu | 11902.5 ns | 13730.5 ns | 0.87
array/copyto!/cpu_to_gpu | 211185 ns | 207788 ns | 1.02
array/copyto!/gpu_to_cpu | 245806 ns | 243117 ns | 1.01
array/accumulate/1d | 109476 ns | 108517 ns | 1.01
array/accumulate/2d | 80318 ns | 79641 ns | 1.01
array/construct | 1315.5 ns | 1306.5 ns | 1.01
array/random/randn/Float32 | 49329.5 ns | 43234.5 ns | 1.14
array/random/randn!/Float32 | 26838 ns | 26328 ns | 1.02
array/random/rand!/Int64 | 27100 ns | 27074 ns | 1.00
array/random/rand!/Float32 | 8577.666666666666 ns | 8647.666666666666 ns | 0.99
array/random/rand/Int64 | 30079 ns | 29948 ns | 1.00
array/random/rand/Float32 | 13234 ns | 13039 ns | 1.01
array/permutedims/4d | 61252 ns | 60777 ns | 1.01
array/permutedims/2d | 55497 ns | 55571 ns | 1.00
array/permutedims/3d | 56650 ns | 55866 ns | 1.01
array/sorting/1d | 2777654 ns | 2764795 ns | 1.00
array/sorting/by | 3368720 ns | 3367795 ns | 1.00
array/sorting/2d | 1085641.5 ns | 1084334 ns | 1.00
cuda/synchronization/stream/auto | 1059.9 ns | 1052.3 ns | 1.01
cuda/synchronization/stream/nonblocking | 6416 ns | 6404.4 ns | 1.00
cuda/synchronization/stream/blocking | 796.6116504854369 ns | 810.0736842105263 ns | 0.98
cuda/synchronization/context/auto | 1177.7 ns | 1185.6 ns | 0.99
cuda/synchronization/context/nonblocking | 6656.8 ns | 6726.6 ns | 0.99
cuda/synchronization/context/blocking | 899.6458333333334 ns | 925.975 ns | 0.97
This comment was automatically generated by workflow using github-action-benchmark.
Should address #2407.

Relatedly, we should consider what to do in the more generic case of a reshaped n-dim `CuArray` where `n > 1`, but I think that is a separate issue.
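For context, here is a minimal end-to-end sketch of the case this PR enables, adapted from the new test above (the `Float32` element type and the final host-side check are illustrative choices, not part of the PR; it assumes the `CuTensor`/`mul!` interface from cuTENSOR.jl as used in the test):

```julia
using CUDA, cuTENSOR, LinearAlgebra

# Contract views of 1D CuArrays and compare against the same contraction
# performed on contiguous copies of the data.
A = CUDA.rand(Float32, 16)
B = CUDA.rand(Float32, 4)
C = CUDA.rand(Float32, 8)
D = copy(C)

vA = @view A[1:4]
vB = @view B[4:4]
vC = @view C[3:6]
vD = @view D[3:6]

# View-backed tensors: the descriptors are built from the views' sizes and
# strides. The 1D views are reshaped so each operand carries the mode labels
# used in the contraction, mirroring the test.
tA = CuTensor(reshape(vA, (4, 1)), [1, 2])
tB = CuTensor(reshape(vB, (1, 1)), [3, 2])
tC = CuTensor(reshape(vC, (1, 4)), [3, 1])
mul!(tC, tA, tB)   # contracts over mode 2, writes the result through the view

# Reference result computed from contiguous copies of the same data.
tA2 = CuTensor(reshape(copy(vA), (4, 1)), [1, 2])
tB2 = CuTensor(reshape(copy(vB), (1, 1)), [3, 2])
tD  = CuTensor(reshape(copy(vD), (1, 4)), [3, 1])
mul!(tD, tA2, tB2)

# The view-based contraction wrote into C[3:6]; compare on the host.
@assert Array(C)[3:6] ≈ vec(Array(tD.data))
```

Before this change, constructing the descriptor for the view-backed operands was the failing step reported in #2407; the copy-based path serves only as a reference for the expected values.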