|
25 | 25 | import dpctl
|
26 | 26 | import dpctl.memory as dpctl_mem
|
27 | 27 | import dpctl.program as dpctl_prog
|
| 28 | +import dpctl.tensor as dpt |
28 | 29 |
|
29 | 30 |
|
30 | 31 | @pytest.mark.parametrize(
|
@@ -107,4 +108,94 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
|
107 | 108 | ref_c = a * np.array(d, dtype=dtype) + b
|
108 | 109 | host_dt, device_dt = timer.dt
|
109 | 110 | assert type(host_dt) is float and type(device_dt) is float
|
110 |
| - assert np.allclose(c, ref_c), "Faled for {}, {}".formatg(r, lr) |
| 111 | + assert np.allclose(c, ref_c), "Failed for {}, {}".formatg(r, lr) |
| 112 | + |
| 113 | + |
def test_async_submit():
    """Submit three OpenCL kernels to one queue and verify their combined output.

    ``kern1`` fills row 0 with ``(i * i) % 17`` and ``kern2`` fills row 1 with
    ``(i * i * i) % 27``. ``kern3`` writes the element-wise minimum of rows 0
    and 1 into row 2; its submission receives ``[e1, e2]`` so that it is
    ordered after the first two kernels.

    The test is skipped when an OpenCL queue cannot be created.
    """
    try:
        q = dpctl.SyclQueue("opencl")
    except dpctl.SyclQueueCreationError:
        pytest.skip("OpenCL queue could not be created")
    oclSrc = (
        "kernel void kern1(global unsigned int *res, unsigned int mod) {"
        "   size_t index = get_global_id(0);"
        "   int ri = (index % mod);"
        "   res[index] = (ri * ri) % mod;"
        "}"
        " "
        "kernel void kern2(global unsigned int *res, unsigned int mod) {"
        "   size_t index = get_global_id(0);"
        "   int ri = (index % mod);"
        "   int ri2 = (ri * ri) % mod;"
        "   res[index] = (ri2 * ri) % mod;"
        "}"
        " "
        "kernel void kern3("
        "   global unsigned int *res, global unsigned int *arg1, "
        "   global unsigned int *arg2)"
        "{"
        "   size_t index = get_global_id(0);"
        "   res[index] = "
        "       (arg1[index] < arg2[index]) ? arg1[index] : arg2[index];"
        "}"
    )
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    kern1Kernel = prog.get_sycl_kernel("kern1")
    kern2Kernel = prog.get_sycl_kernel("kern2")
    kern3Kernel = prog.get_sycl_kernel("kern3")

    assert isinstance(kern1Kernel, dpctl_prog.SyclKernel)
    assert isinstance(kern2Kernel, dpctl_prog.SyclKernel)
    # Check the third kernel too; kern2Kernel was previously asserted twice.
    assert isinstance(kern3Kernel, dpctl_prog.SyclKernel)

    n = 1024 * 512
    X = dpt.empty((3, n), dtype="u4", usm_type="device", sycl_queue=q)
    first_row = dpctl_mem.as_usm_memory(X[0])
    second_row = dpctl_mem.as_usm_memory(X[1])
    third_row = dpctl_mem.as_usm_memory(X[2])

    e1 = q.submit(
        kern1Kernel,
        [
            first_row,
            ctypes.c_uint(17),
        ],
        [
            n,
        ],
    )
    e2 = q.submit(
        kern2Kernel,
        [
            second_row,
            ctypes.c_uint(27),
        ],
        [
            n,
        ],
    )
    # kern3 reads rows 0 and 1, so it is submitted with e1 and e2 as the
    # trailing list argument (presumably the events it depends on).
    e3 = q.submit(
        kern3Kernel,
        [third_row, first_row, second_row],
        [
            n,
        ],
        None,
        [e1, e2],
    )
    # NOTE(review): this expects that none of the three kernels has finished
    # by the time its status is queried, which looks timing-dependent on a
    # fast device -- confirm whether a weaker check is wanted.
    status_complete = dpctl.event_status_type.complete
    assert status_complete not in (
        e1.execution_status,
        e2.execution_status,
        e3.execution_status,
    )

    e3.wait()
    Xnp = dpt.asnumpy(X)

    # Build the expected result with vectorized NumPy operations instead of a
    # Python loop over all n elements. int64 indices keep i**3 (about 1.4e17
    # at most here) from overflowing before the modulus is taken.
    idx = np.arange(n, dtype=np.int64)
    Xref = np.empty((3, n), dtype="u4")
    Xref[0] = (idx * idx) % 17
    Xref[1] = (idx * idx * idx) % 27
    Xref[2] = np.minimum(Xref[0], Xref[1])

    assert np.array_equal(Xnp, Xref)
0 commit comments