@@ -4,7 +4,16 @@ module DynamicExpressionsCUDAExt
 using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx
 using DynamicExpressions: OperatorEnum, AbstractExpressionNode
 using DynamicExpressions.EvaluateModule: get_nbin, get_nuna
-using DynamicExpressions.AsArrayModule: as_array
+using DynamicExpressions.AsArrayModule:
+    as_array,
+    IDX_DEGREE,
+    IDX_FEATURE,
+    IDX_OP,
+    IDX_EXECUTION_ORDER,
+    IDX_SELF,
+    IDX_L,
+    IDX_R,
+    IDX_CONSTANT
 using DispatchDoctor: @stable
 
 import DynamicExpressions.EvaluateModule: eval_tree_array
@@ ... @@
     ## in the input data by the number of nodes in the tree.
     ## It has one extra row to store the constant values.
     gworkspace = @something(gpu_workspace, similar(gcX, num_elem + 1, num_nodes))
-    gval = @view gworkspace[end, :]
     if _update_buffers
-        copyto!(gval, val)
+        copyto!(@view(gworkspace[end, :]), val)
     end
+    val_idx = size(gworkspace, 1)
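+    # The constant values live in that extra last row of `gworkspace`,
+    # so `val_idx` is the row index the kernel uses to read them.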
 
-    ## Index arrays (much faster to have `@view` here)
     gbuffer = if !_update_buffers
         gpu_buffer
     elseif gpu_buffer === nothing
         copyto!(gpu_buffer, buffer)
     end
 
-    #! format: off
-    gdegree = @view gbuffer[1, :]
-    gfeature = @view gbuffer[2, :]
-    gop = @view gbuffer[3, :]
-    gexecution_order = @view gbuffer[4, :]
-    gidx_self = @view gbuffer[5, :]
-    gidx_l = @view gbuffer[6, :]
-    gidx_r = @view gbuffer[7, :]
-    gconstant = @view gbuffer[8, :]
-    #! format: on
-    # TODO: This is a bit dangerous as we're assuming exact indices
+    # Removed the @view definitions of gdegree, gfeature, etc.
+    # We index directly into gbuffer using the IDX_* constants imported above.
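+    # e.g. the old `gdegree = @view gbuffer[1, :]` lookup becomes `gbuffer[IDX_DEGREE, node]` inside the kernel.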
 
     num_threads = 256
     num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads))
     _launch_gpu_kernel!(
         num_threads, num_blocks, num_launches, gworkspace,
         # Thread info:
-        num_elem, num_nodes, gexecution_order,
-        # Input data and tree
-        operators, gcX, gidx_self, gidx_l, gidx_r,
-        gdegree, gconstant, gval, gfeature, gop,
+        num_elem, num_nodes,
+        # We'll pass gbuffer directly to the kernel now:
+        operators, gcX, gbuffer, val_idx,
     )
     #! format: on
@@ -109,34 +107,30 @@ end
 @stable default_mode = "disable" function _launch_gpu_kernel!(
     num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2},
     # Thread info:
-    num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I},
-    # Input data and tree
-    operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
-    degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray,
-) where {I,T}
+    num_elem::Integer, num_nodes::Integer,
+    operators::OperatorEnum, cX::AbstractArray{T,2}, gbuffer::AbstractArray{Int32,2},
+    val_idx::Integer
+) where {T}
     #! format: on
     nuna = get_nuna(typeof(operators))
     nbin = get_nbin(typeof(operators))
     (nuna > 10 || nbin > 10) &&
         error("Too many operators. Kernels are only compiled up to 10.")
     gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
-    for launch in one(I):I(num_launches)
+    for launch in one(Int32):Int32(num_launches)
         #! format: off
         if buffer isa CuArray
             @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
                 buffer,
-                launch, num_elem, num_nodes, execution_order,
-                cX, idx_self, idx_l, idx_r,
-                degree, constant, val, feature, op
+                launch, num_elem, num_nodes,
+                cX, gbuffer, val_idx
             )
         else
             Threads.@threads for i in 1:(num_threads * num_blocks)
                 gpu_kernel!(
                     buffer,
-                    launch, num_elem, num_nodes, execution_order,
-                    cX, idx_self, idx_l, idx_r,
-                    degree, constant, val, feature, op,
-                    i
+                    launch, num_elem, num_nodes,
+                    cX, gbuffer, val_idx, i
                 )
             end
         end
@@ -155,55 +149,53 @@ for nuna in 0:10, nbin in 0:10
     @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin})
         #! format: off
         function (
-            # Storage:
             buffer,
-            # Thread info:
-            launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray,
-            # Input data and tree
-            cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
-            degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
-            # Override for unittesting:
+            launch::Integer, num_elem::Integer, num_nodes::Integer,
+            cX::AbstractArray, gbuffer::AbstractArray{Int32,2},
+            val_idx::Integer,
             i=nothing,
         )
-            i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i
+            i = @something(i, (blockIdx().x - 1) * blockDim().x + threadIdx().x)
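+            # `i` may be supplied explicitly (the unit-testing / CPU-fallback override);
+            # otherwise derive it from the CUDA block and thread indices.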
             if i > num_elem * num_nodes
                 return nothing
             end
 
             node = (i - 1) % num_nodes + 1
             elem = (i - node) ÷ num_nodes + 1
 
-            #! format: off
+
             @inbounds begin
-                if execution_order[node] != launch
+                if gbuffer[IDX_EXECUTION_ORDER, node] != launch
                     return nothing
                 end
 
-                cur_degree = degree[node]
-                cur_idx = idx_self[node]
+                # Use constants to index gbuffer:
+                cur_degree = gbuffer[IDX_DEGREE, node]
+                cur_idx = gbuffer[IDX_SELF, node]
+
                 if cur_degree == 0
-                    if constant[node] == 1
-                        cur_val = val[node]
+                    if gbuffer[IDX_CONSTANT, node] == 1
+                        cur_val = buffer[val_idx, node]
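+                        # Constants are read from the extra last row (`val_idx`) of the workspace buffer.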
                         buffer[elem, cur_idx] = cur_val
                     else
-                        cur_feature = feature[node]
+                        cur_feature = gbuffer[IDX_FEATURE, node]
                         buffer[elem, cur_idx] = cX[cur_feature, elem]
                     end
                 else
                     if cur_degree == 1 && $nuna > 0
-                        cur_op = op[node]
-                        l_idx = idx_l[node]
+                        cur_op = gbuffer[IDX_OP, node]
+                        l_idx = gbuffer[IDX_L, node]
                         Base.Cartesian.@nif(
                             $nuna,
                             i -> i == cur_op,
                             i -> let op = operators.unaops[i]
                                 buffer[elem, cur_idx] = op(buffer[elem, l_idx])
                             end
                         )
-                    elseif $nbin > 0  # Note this check is to avoid type inference issues when binops is empty
-                        cur_op = op[node]
-                        l_idx = idx_l[node]
-                        r_idx = idx_r[node]
+                    elseif $nbin > 0
+                        cur_op = gbuffer[IDX_OP, node]
+                        l_idx = gbuffer[IDX_L, node]
+                        r_idx = gbuffer[IDX_R, node]
                         Base.Cartesian.@nif(
                             $nbin,
                             i -> i == cur_op,