Description
Describe the issue
Consider the following code:
#!/usr/bin/env python3
import triton
import torch
@triton.jit
def test(x, output, N: triton.language.constexpr, D: triton.language.constexpr):
x = triton.language.make_block_ptr(
x,
shape=(N, D),
strides=(D, 1),
offsets=(0, 0),
block_shape=(N, D),
order=(0, 1),
)
x = triton.language.load(x)
output = triton.language.make_block_ptr(
output,
shape=(D, D),
strides=(D, 1),
offsets=(0, 0),
block_shape=(D, D),
order=(0, 1),
)
triton.language.store(output, triton.language.dot(x.trans(), x))
if __name__ == "__main__":
torch.cuda.manual_seed(0)
N = 256
D = 16
x = torch.randn(N, D, dtype=torch.float16, device="cuda")
output = torch.zeros(D, D, device="cuda")
test[(1,)](x, output, N, D)
This does an outer product across the N dimension; namely a 16x256x16 matrix multiply. Unfortunately, the code generated for this is quite poor; Triton loads x just fine but then it tries to transpose it element-by-element (this is immediately obvious in the SASS code, which is full of LDS.U16 or whatever). This causes extreme traffic to shared memory subsystem; so much so that it is actually faster to double load x from global memory (the second time with transposed strides) so that the inputs the product are not transposed.
I would expect that this code instead used the transposed version of the MMA instructions, or barring that, did a shuffle in registers rather than trying to do it element-by-element in shared memory. Or even vectorized the loads.
For reference, here's the code I pulled out of Nsight Compute for this:
test
1 0000e16f e3246800 LDC R1, c[0x0][0x28] 7 0.24% 32
2 0000e16f e3246810 S2R R24, SR_TID.X 8 0.24% 32
3 0000e16f e3246820 ULDC.64 UR4, c[0x0][0x210] 8 0.24% 32
4 0000e16f e3246830 ULDC.64 UR6, c[0x0][0x208] 8 0.24% 32
5 0000e16f e3246840 LOP3.LUT R40, R24, 0x4, RZ, 0xc0, !PT 9 0.24% 32
6 0000e16f e3246850 SHF.R.U32.HI R2, RZ, 0x1, R24 10 0.24% 32
7 0000e16f e3246860 LOP3.LUT R0, R40, 0x8, R24, 0xf8, !PT 11 0.24% 32
8 0000e16f e3246870 SGXT.U32 R2, R2, 0x1 11 0.24% 32
9 0000e16f e3246880 SHF.R.U32.HI R3, RZ, 0x1, R0 12 0.24% 32
10 0000e16f e3246890 LOP3.LUT R0, R24, 0x20, RZ, 0xc0, !PT 12 0.24% 32
11 0000e16f e32468a0 SHF.R.U32.HI R14, RZ, 0x1, R24 13 0.24% 32
12 0000e16f e32468b0 LOP3.LUT R3, R3, R2, RZ, 0xfc, !PT 13 0.24% 32
13 0000e16f e32468c0 SHF.R.U32.HI R16, RZ, 0x1, R0 13 0.24% 32
14 0000e16f e32468d0 LOP3.LUT R15, R3, 0x8, R14, 0xf8, !PT 14 0.24% 32
15 0000e16f e32468e0 LOP3.LUT R12, R24, 0x1, RZ, 0xc0, !PT 14 0.24% 32
16 0000e16f e32468f0 LOP3.LUT R13, R15, R16, RZ, 0xfc, !PT 15 0.24% 32
17 0000e16f e3246900 IMAD.SHL.U32 R17, R12, 0x8, RZ 16 0.24% 32
18 0000e16f e3246910 LOP3.LUT R13, R13, 0x20, R14, 0xf8, !PT 16 0.24% 32
19 0000e16f e3246920 LOP3.LUT R4, R13, 0x40, RZ, 0x3c, !PT 17 0.24% 32
20 0000e16f e3246930 IMAD.WIDE.U32 R2, R13, 0x10, RZ 19 0.24% 32
21 0000e16f e3246940 LOP3.LUT R6, R13, 0x80, RZ, 0x3c, !PT 20 0.24% 32
22 0000e16f e3246950 LOP3.LUT R8, R13, 0xc0, RZ, 0x3c, !PT 21 0.24% 32
23 0000e16f e3246960 IMAD.WIDE.U32 R4, R4, 0x10, RZ 22 0.24% 32
24 0000e16f e3246970 LOP3.LUT R2, R2, R17, RZ, 0xfc, !PT 22 0.24% 32
25 0000e16f e3246980 IMAD.WIDE.U32 R6, R6, 0x10, RZ 23 0.24% 32
26 0000e16f e3246990 LOP3.LUT R19, R4, R17, RZ, 0xfc, !PT 24 0.24% 32
27 0000e16f e32469a0 LEA R10, P0, R2, UR4, 0x1 24 0.24% 32
28 0000e16f e32469b0 IMAD.WIDE.U32 R8, R8, 0x10, RZ 25 0.24% 32
29 0000e16f e32469c0 LOP3.LUT R6, R6, R17, RZ, 0xfc, !PT 25 0.24% 32
30 0000e16f e32469d0 LEA R4, P1, R19, UR4, 0x1 26 0.24% 32
31 0000e16f e32469e0 LOP3.LUT R8, R8, R17, RZ, 0xfc, !PT 26 0.24% 32
32 0000e16f e32469f0 LEA R20, P2, R6, UR4, 0x1 27 0.24% 32
33 0000e16f e3246a00 LEA R60, P3, R8, UR4, 0x1 28 0.24% 32
34 0000e16f e3246a10 LEA.HI.X R11, R2, UR5, R3, 0x1, P0 29 0.24% 32
35 0000e16f e3246a20 LEA.HI.X R5, R19, UR5, R5, 0x1, P1 27 0.24% 32
36 0000e16f e3246a30 LEA.HI.X R21, R6, UR5, R7, 0x1, P2 27 0.24% 32
37 0000e16f e3246a40 LEA.HI.X R61, R8, UR5, R9, 0x1, P3 26 0.24% 32
38 0000e16f e3246a50 LDG.E.128 R8, desc[UR6][R10.64] 26 0.24% 32 Global Load 128
39 0000e16f e3246a60 LDG.E.128 R4, desc[UR6][R4.64] 28 0.24% 32 Global Load 128
40 0000e16f e3246a70 LDG.E.128 R20, desc[UR6][R20.64] 30 0.24% 32 Global Load 128
41 0000e16f e3246a80 LDG.E.128 R60, desc[UR6][R60.64] 32 0.24% 32 Global Load 128
42 0000e16f e3246a90 S2UR UR5, SR_CgaCtaId 32 0.24% 32
43 0000e16f e3246aa0 UMOV UR4, 0x400 32 0.24% 32
44 0000e16f e3246ab0 IMAD.SHL.U32 R65, R24, 0x10, RZ 33 0.24% 32
45 0000e16f e3246ac0 LOP3.LUT R14, R16, 0x20, R14, 0xf8, !PT 33 0.24% 32
46 0000e16f e3246ad0 IMAD.SHL.U32 R2, R24, 0x2, RZ 33 0.24% 32
47 0000e16f e3246ae0 LOP3.LUT R17, R17, 0x8, R24, 0x78, !PT 33 0.24% 32
48 0000e16f e3246af0 IMAD.SHL.U32 R16, R12, 0x200, RZ 33 0.24% 32
49 0000e16f e3246b00 LOP3.LUT R3, R65, 0x1c0, RZ, 0xc0, !PT 34 0.24% 32
50 0000e16f e3246b10 LOP3.LUT R15, R15, R14, RZ, 0xfc, !PT 34 0.24% 32
51 0000e16f e3246b20 IMAD.SHL.U32 R14, R17, 0x2, RZ 34 0.24% 32
52 0000e16f e3246b30 LOP3.LUT R19, R3, 0x200, RZ, 0xfc, !PT 34 0.24% 32
53 0000e16f e3246b40 LOP3.LUT R18, R3, 0x6, R2, 0xf8, !PT 35 0.24% 32
54 0000e16f e3246b50 IMAD R15, R15, 0x20, R14 34 0.24% 32
55 0000e16f e3246b60 LOP3.LUT R16, R13, R16, RZ, 0xfc, !PT 34 0.24% 32
56 0000e16f e3246b70 IMAD R13, R13, 0x20, R14 34 0.24% 32
57 0000e16f e3246b80 SHF.R.U32.HI R0, RZ, 0x2, R0 33 0.24% 32
58 0000e16f e3246b90 LOP3.LUT R44, R16, 0x40, RZ, 0x3c, !PT 34 0.24% 32
59 0000e16f e3246ba0 LOP3.LUT R52, R16, 0x80, RZ, 0x3c, !PT 35 0.24% 32
60 0000e16f e3246bb0 LOP3.LUT R69, R16, 0x100, RZ, 0x3c, !PT 36 0.24% 32
61 0000e16f e3246bc0 ULEA UR4, UR5, UR4, 0x18 36 0.24% 32
62 0000e16f e3246bd0 SHF.R.U32.HI R17, RZ, 0x5, R52 37 0.24% 32
63 0000e16f e3246be0 LOP3.LUT R66, R16, 0x140, RZ, 0x3c, !PT 38 0.24% 32
64 0000e16f e3246bf0 UIADD3 UR5, UR4, 0x2000, URZ 38 0.24% 32
65 0000e16f e3246c00 LOP3.LUT R53, R16, 0xc0, RZ, 0x3c, !PT 39 0.24% 32
66 0000e16f e3246c10 LOP3.LUT R68, R16, 0x180, RZ, 0x3c, !PT 40 0.24% 32
67 0000e16f e3246c20 LOP3.LUT R67, R16, 0x1c0, RZ, 0x3c, !PT 41 0.24% 32
68 0000e16f e3246c30 LEA.HI R3, R3, UR5, RZ, 0x1b 41 0.24% 32
69 0000e16f e3246c40 LEA.HI R19, R19, UR5, RZ, 0x1b 41 0.24% 32
70 0000e16f e3246c50 LEA R45, R12, UR5, 0x4 42 0.24% 32
71 0000e16f e3246c60 IMAD R2, R18, 0x2, R3 42 0.24% 32
72 0000e16f e3246c70 SHF.R.U32.HI R12, RZ, 0x5, R69 42 0.24% 32
73 0000e16f e3246c80 IMAD R3, R18, 0x2, R19 43 0.24% 32
74 0000e16f e3246c90 SHF.R.U32.HI R18, RZ, 0x5, R44 42 0.24% 32
75 0000e16f e3246ca0 IMAD R45, R16, 0x2, R45 42 0.24% 32
76 0000e16f e3246cb0 LOP3.LUT R19, R17, 0x7fffffe, RZ, 0xc0, !PT 42 0.24% 32
77 0000e16f e3246cc0 LOP3.LUT R18, R18, 0x7fffffe, RZ, 0xc0, !PT 41 0.24% 32
78 0000e16f e3246cd0 LOP3.LUT R12, R12, 0x7fffffe, RZ, 0xc0, !PT 41 0.24% 32
79 0000e16f e3246ce0 VIADD R19, R19, UR5 41 0.24% 32
80 0000e16f e3246cf0 SHF.R.U32.HI R14, RZ, 0x5, R66 42 0.24% 32
81 0000e16f e3246d00 VIADD R17, R18, UR5 43 0.24% 32
82 0000e16f e3246d10 SHF.R.U32.HI R24, RZ, 0x5, R53 43 0.24% 32
83 0000e16f e3246d20 VIADD R12, R12, UR5 43 0.24% 32
84 0000e16f e3246d30 SHF.R.U32.HI R16, RZ, 0x5, R68 44 0.24% 32
85 0000e16f e3246d40 IMAD R44, R44, 0x2, R17 44 0.24% 32
86 0000e16f e3246d50 SHF.R.U32.HI R17, RZ, 0x5, R67 44 0.24% 32
87 0000e16f e3246d60 IMAD R69, R69, 0x2, R12 44 0.24% 32
88 0000e16f e3246d70 LOP3.LUT R14, R14, 0x7fffffe, RZ, 0xc0, !PT 43 0.24% 32
89 0000e16f e3246d80 IMAD R52, R52, 0x2, R19 43 0.24% 32
90 0000e16f e3246d90 LOP3.LUT R24, R24, 0x7fffffe, RZ, 0xc0, !PT 42 0.24% 32
91 0000e16f e3246da0 LOP3.LUT R16, R16, 0x7fffffe, RZ, 0xc0, !PT 42 0.24% 32
92 0000e16f e3246db0 LOP3.LUT R18, R17, 0x7fffffe, RZ, 0xc0, !PT 43 0.24% 32
93 0000e16f e3246dc0 VIADD R17, R14, UR5 43 0.24% 32
94 0000e16f e3246dd0 LOP3.LUT R12, R15, 0x800, RZ, 0xfc, !PT 43 0.24% 32
95 0000e16f e3246de0 VIADD R24, R24, UR5 43 0.24% 32
96 0000e16f e3246df0 LOP3.LUT R14, R15, 0x1000, RZ, 0xfc, !PT 44 0.24% 32
97 0000e16f e3246e00 VIADD R19, R16, UR5 45 0.24% 32
98 0000e16f e3246e10 LOP3.LUT R15, R15, 0x1800, RZ, 0xfc, !PT 45 0.24% 32
99 0000e16f e3246e20 VIADD R18, R18, UR5 45 0.24% 32
100 0000e16f e3246e30 IMAD R66, R66, 0x2, R17 45 0.24% 32
101 0000e16f e3246e40 IMAD R53, R53, 0x2, R24 45 0.24% 32
102 0000e16f e3246e50 IMAD R68, R68, 0x2, R19 44 0.24% 32
103 0000e16f e3246e60 IMAD R67, R67, 0x2, R18 44 0.24% 32
104 0000e16f e3246e70 STS.128 [R13+UR4], R8 44 33.33% 0.24% 32 Shared Store 128
105 0000e16f e3246e80 PRMT R16, R8, 0x7632, R16 43 0.24% 32
106 0000e16f e3246e90 PRMT R17, R9, 0x7632, R17 43 0.24% 32
107 0000e16f e3246ea0 STS.128 [R12+UR4], R4 43 0.24% 32 Shared Store 128
108 0000e16f e3246eb0 PRMT R33, R10, 0x7632, R33 42 0.24% 32
109 0000e16f e3246ec0 PRMT R18, R11, 0x7632, R18 42 0.24% 32
110 0000e16f e3246ed0 STS.128 [R14+UR4], R20 42 0.24% 32 Shared Store 128
111 0000e16f e3246ee0 PRMT R19, R4, 0x7632, R19 41 0.24% 32
112 0000e16f e3246ef0 PRMT R55, R5, 0x7632, R55 41 0.24% 32
113 0000e16f e3246f00 STS.128 [R15+UR4], R60 41 0.24% 32 Shared Store 128
114 0000e16f e3246f10 PRMT R56, R6, 0x7632, R56 40 0.24% 32
115 0000e16f e3246f20 PRMT R57, R7, 0x7632, R57 40 0.24% 32
116 0000e16f e3246f30 STS.U16 [R45], R8 40 0.24% 32 Shared Store 16 0.04%
117 0000e16f e3246f40 PRMT R59, R21, 0x7632, R59 39 0.24% 32
118 0000e16f e3246f50 PRMT R64, R22, 0x7632, R64 39 0.24% 32
119 0000e16f e3246f60 STS.U16 [R44], R16 39 0.24% 32 Shared Store 16 0.04%
120 0000e16f e3246f70 STS.U16 [R52], R9 38 0.24% 32 Shared Store 16 0.04%
121 0000e16f e3246f80 STS.U16 [R53], R17 37 0.24% 32 Shared Store 16 0.04%
122 0000e16f e3246f90 STS.U16 [R69], R10 36 0.24% 32 Shared Store 16 0.04%
123 0000e16f e3246fa0 STS.U16 [R66], R33 35 0.24% 32 Shared Store 16 0.04%
124 0000e16f e3246fb0 STS.U16 [R68], R11 34 0.24% 32 Shared Store 16 0.04%
125 0000e16f e3246fc0 STS.U16 [R67], R18 33 0.24% 32 Shared Store 16 0.04%
126 0000e16f e3246fd0 BAR.SYNC.DEFER_BLOCKING 0x0 32 0.24% 32
127 0000e16f e3246fe0 LDS.U16 R48, [R2] 33 0.24% 32 Shared Load 16 0.27%
128 0000e16f e3246ff0 LDS.U16 R47, [R2+0x2] 34 0.24% 32 Shared Load 16 0.23%
129 0000e16f e3247000 LDS.U16 R49, [R3+0x400] 35 0.24% 32 Shared Load 16 0.27%
130 0000e16f e3247010 LDS.U16 R54, [R3+0x402] 36 0.24% 32 Shared Load 16 0.23%
131 0000e16f e3247020 LDS.U16 R50, [R2+0x10] 37 0.24% 32 Shared Load 16 0.27%
132 0000e16f e3247030 LDS.U16 R43, [R2+0x12] 38 0.24% 32 Shared Load 16 0.23%
133 0000e16f e3247040 LDS.U16 R46, [R3+0x410] 39 0.24% 32 Shared Load 16 0.27%
134 0000e16f e3247050 LDS.U16 R51, [R3+0x412] 40 0.24% 32 Shared Load 16 0.23%
135 0000e16f e3247060 LDS.U16 R41, [R2+0x20] 41 0.24% 32 Shared Load 16 0.27%
136 0000e16f e3247070 LDS.U16 R42, [R2+0x22] 42 0.24% 32 Shared Load 16 0.23%
137 0000e16f e3247080 LDS.U16 R38, [R3+0x420] 43 0.24% 32 Shared Load 16 0.27%
138 0000e16f e3247090 LDS.U16 R39, [R3+0x422] 44 0.24% 32 Shared Load 16 0.23%
139 0000e16f e32470a0 IMAD R48, R47, 0x10000, R48 44 0.24% 32
140 0000e16f e32470b0 LDS.U16 R36, [R2+0x30] 44 0.24% 32 Shared Load 16 0.27%
141 0000e16f e32470c0 LDS.U16 R37, [R2+0x32] 45 0.24% 32 Shared Load 16 0.23%
142 0000e16f e32470d0 PRMT R49, R49, 0x5410, R54 45 0.24% 32
143 0000e16f e32470e0 LDS.U16 R34, [R3+0x430] 45 0.24% 32 Shared Load 16 0.27%
144 0000e16f e32470f0 LDS.U16 R35, [R3+0x432] 46 0.24% 32 Shared Load 16 0.23%
145 0000e16f e3247100 IMAD R50, R43, 0x10000, R50 46 0.24% 32
146 0000e16f e3247110 LDS.U16 R8, [R2+0x40] 46 0.24% 32 Shared Load 16 0.27%
147 0000e16f e3247120 LDS.U16 R33, [R2+0x42] 47 0.24% 32 Shared Load 16 0.23%
148 0000e16f e3247130 PRMT R51, R46, 0x5410, R51 47 0.24% 32
149 0000e16f e3247140 LDS.U16 R9, [R3+0x440] 47 0.24% 32 Shared Load 16 0.27%
150 0000e16f e3247150 LDS.U16 R32, [R3+0x442] 48 0.24% 32 Shared Load 16 0.23%
151 0000e16f e3247160 LDS.U16 R10, [R2+0x50] 49 0.24% 32 Shared Load 16 0.27%
152 0000e16f e3247170 LDS.U16 R31, [R2+0x52] 50 0.24% 32 Shared Load 16 0.23%
153 0000e16f e3247180 LDS.U16 R11, [R3+0x450] 51 0.24% 32 Shared Load 16 0.27%
154 0000e16f e3247190 LDS.U16 R30, [R3+0x452] 52 0.24% 32 Shared Load 16 0.23%
155 0000e16f e32471a0 LDS.U16 R12, [R2+0x60] 53 0.24% 32 Shared Load 16 0.27%
156 0000e16f e32471b0 LDS.U16 R27, [R2+0x62] 54 0.24% 32 Shared Load 16 0.23%
157 0000e16f e32471c0 LDS.U16 R13, [R3+0x460] 55 0.24% 32 Shared Load 16 0.27%
158 0000e16f e32471d0 LDS.U16 R24, [R3+0x462] 56 0.24% 32 Shared Load 16 0.23%
159 0000e16f e32471e0 IMAD R8, R33, 0x10000, R8 56 0.24% 32
160 0000e16f e32471f0 LDS.U16 R14, [R2+0x70] 56 0.24% 32 Shared Load 16 0.27%
161 0000e16f e3247200 LDS.U16 R17, [R2+0x72] 57 0.24% 32 Shared Load 16 0.23%
162 0000e16f e3247210 PRMT R9, R9, 0x5410, R32 57 0.24% 32
163 0000e16f e3247220 LDS.U16 R15, [R3+0x470] 57 0.24% 32 Shared Load 16 0.27%
164 0000e16f e3247230 LDS.U16 R16, [R3+0x472] 58 0.24% 32 Shared Load 16 0.23%
165 0000e16f e3247240 IMAD R10, R31, 0x10000, R10 58 0.24% 32
166 0000e16f e3247250 BAR.SYNC.DEFER_BLOCKING 0x0 57 0.24% 32
167 0000e16f e3247260 PRMT R11, R11, 0x5410, R30 57 0.24% 32
168 0000e16f e3247270 IMAD R12, R27, 0x10000, R12 56 0.24% 32
169 0000e16f e3247280 PRMT R13, R13, 0x5410, R24 55 0.24% 32
170 0000e16f e3247290 STS.U16 [R45], R4 54 0.24% 32 Shared Store 16 0.04%
171 0000e16f e32472a0 STS.U16 [R44], R19 53 0.24% 32 Shared Store 16 0.04%
172 0000e16f e32472b0 IMAD R14, R17, 0x10000, R14 52 0.24% 32
173 0000e16f e32472c0 STS.U16 [R52], R5 51 0.24% 32 Shared Store 16 0.04%
174 0000e16f e32472d0 STS.U16 [R53], R55 50 0.24% 32 Shared Store 16 0.04%
175 0000e16f e32472e0 PRMT R15, R15, 0x5410, R16 49 0.24% 32
176 0000e16f e32472f0 STS.U16 [R69], R6 48 0.24% 32 Shared Store 16 0.04%
177 0000e16f e3247300 IMAD.SHL.U32 R5, R40, 0x2, RZ 48 0.24% 32
178 0000e16f e3247310 STS.U16 [R66], R56 47 0.24% 32 Shared Store 16 0.04%
179 0000e16f e3247320 LOP3.LUT R4, R5, R0, RZ, 0x3c, !PT 47 0.24% 32
180 0000e16f e3247330 STS.U16 [R68], R7 46 0.24% 32 Shared Store 16 0.04%
181 0000e16f e3247340 PRMT R5, R38, 0x5410, R39 46 0.24% 32
182 0000e16f e3247350 IMAD R6, R37, 0x10000, R36 45 0.24% 32
183 0000e16f e3247360 LOP3.LUT R65, R4, 0x1f0, R65, 0xf8, !PT 44 0.24% 32
184 0000e16f e3247370 STS.U16 [R67], R57 43 0.24% 32 Shared Store 16 0.04%
185 0000e16f e3247380 IMAD R4, R42, 0x10000, R41 43 0.24% 32
186 0000e16f e3247390 PRMT R37, R20, 0x7632, R37 41 0.24% 32
187 0000e16f e32473a0 BAR.SYNC.DEFER_BLOCKING 0x0 41 0.24% 32
188 0000e16f e32473b0 LEA R65, R65, UR4, 0x1 41 0.24% 32
189 0000e16f e32473c0 PRMT R7, R34, 0x5410, R35 42 0.24% 32
190 0000e16f e32473d0 LDS.U16 R28, [R2] 41 0.24% 32 Shared Load 16 0.27%
191 0000e16f e32473e0 LDS.U16 R29, [R2+0x2] 42 0.24% 32 Shared Load 16 0.23%
192 0000e16f e32473f0 LDS.U16 R25, [R3+0x400] 43 0.24% 32 Shared Load 16 0.27%
193 0000e16f e3247400 LDS.U16 R26, [R3+0x402] 44 0.24% 32 Shared Load 16 0.23%
194 0000e16f e3247410 LDS.U16 R18, [R2+0x10] 45 0.24% 32 Shared Load 16 0.27%
195 0000e16f e3247420 LDS.U16 R19, [R2+0x12] 46 0.24% 32 Shared Load 16 0.23%
196 0000e16f e3247430 LDS.U16 R58, [R3+0x412] 47 0.24% 32 Shared Load 16 0.23%
197 0000e16f e3247440 LDS.U16 R56, [R2+0x20] 48 0.24% 32 Shared Load 16 0.27%
198 0000e16f e3247450 LDS.U16 R57, [R2+0x22] 49 0.24% 32 Shared Load 16 0.23%
199 0000e16f e3247460 LDS.U16 R55, [R3+0x420] 50 0.24% 32 Shared Load 16 0.27%
200 0000e16f e3247470 LDS.U16 R54, [R3+0x422] 51 0.24% 32 Shared Load 16 0.23%
201 0000e16f e3247480 IMAD R16, R29, 0x10000, R28 52 0.24% 32
202 0000e16f e3247490 LDS.U16 R46, [R2+0x30] 51 0.24% 32 Shared Load 16 0.27%
203 0000e16f e32474a0 LDS.U16 R47, [R2+0x32] 52 0.24% 32 Shared Load 16 0.23%
204 0000e16f e32474b0 PRMT R17, R25, 0x5410, R26 53 0.24% 32
205 0000e16f e32474c0 LDS.U16 R42, [R3+0x430] 52 0.24% 32 Shared Load 16 0.27%
206 0000e16f e32474d0 LDS.U16 R43, [R3+0x432] 53 0.24% 32 Shared Load 16 0.23%
207 0000e16f e32474e0 IMAD R18, R19, 0x10000, R18 53 0.24% 32
208 0000e16f e32474f0 LDS.U16 R24, [R2+0x40] 53 0.24% 32 Shared Load 16 0.27%
209 0000e16f e3247500 LDS.U16 R19, [R3+0x410] 54 0.24% 32 Shared Load 16 0.27%
210 0000e16f e3247510 LDS.U16 R41, [R2+0x42] 55 0.24% 32 Shared Load 16 0.23%
211 0000e16f e3247520 LDS.U16 R25, [R3+0x440] 56 0.24% 32 Shared Load 16 0.27%
212 0000e16f e3247530 LDS.U16 R40, [R3+0x442] 57 0.24% 32 Shared Load 16 0.23%
213 0000e16f e3247540 LDS.U16 R26, [R2+0x50] 58 0.24% 32 Shared Load 16 0.27%
214 0000e16f e3247550 LDS.U16 R39, [R2+0x52] 59 0.24% 32 Shared Load 16 0.23%
215 0000e16f e3247560 LDS.U16 R27, [R3+0x450] 60 0.24% 32 Shared Load 16 0.27%
216 0000e16f e3247570 LDS.U16 R38, [R3+0x452] 61 0.24% 32 Shared Load 16 0.23%
217 0000e16f e3247580 LDS.U16 R28, [R2+0x60] 62 0.24% 32 Shared Load 16 0.27%
218 0000e16f e3247590 LDS.U16 R35, [R2+0x62] 63 0.24% 32 Shared Load 16 0.23%
219 0000e16f e32475a0 LDS.U16 R29, [R3+0x460] 64 0.24% 32 Shared Load 16 0.27%
220 0000e16f e32475b0 PRMT R19, R19, 0x5410, R58 64 0.24% 32
221 0000e16f e32475c0 LDS.U16 R34, [R3+0x462] 64 0.24% 32 Shared Load 16 0.23%
222 0000e16f e32475d0 IMAD R24, R41, 0x10000, R24 64 0.24% 32
223 0000e16f e32475e0 LDS.U16 R30, [R2+0x70] 64 0.24% 32 Shared Load 16 0.27%
224 0000e16f e32475f0 LDS.U16 R33, [R2+0x72] 65 0.24% 32 Shared Load 16 0.23%
225 0000e16f e3247600 PRMT R25, R25, 0x5410, R40 65 0.24% 32
226 0000e16f e3247610 LDS.U16 R31, [R3+0x470] 65 0.24% 32 Shared Load 16 0.27%
227 0000e16f e3247620 LDS.U16 R32, [R3+0x472] 66 0.24% 32 Shared Load 16 0.23%
228 0000e16f e3247630 IMAD R26, R39, 0x10000, R26 66 33.33% 0.24% 32
229 0000e16f e3247640 BAR.SYNC.DEFER_BLOCKING 0x0 65 0.24% 32
230 0000e16f e3247650 PRMT R27, R27, 0x5410, R38 65 0.24% 32
231 0000e16f e3247660 IMAD R28, R35, 0x10000, R28 64 0.24% 32
232 0000e16f e3247670 PRMT R29, R29, 0x5410, R34 63 0.24% 32
233 0000e16f e3247680 STS.U16 [R45], R20 62 0.24% 32 Shared Store 16 0.04%
234 0000e16f e3247690 STS.U16 [R44], R37 61 0.24% 32 Shared Store 16 0.04%
235 0000e16f e32476a0 IMAD R30, R33, 0x10000, R30 60 0.24% 32
236 0000e16f e32476b0 STS.U16 [R52], R21 59 0.24% 32 Shared Store 16 0.04%
237 0000e16f e32476c0 IMAD R20, R57, 0x10000, R56 60 0.24% 32
238 0000e16f e32476d0 STS.U16 [R53], R59 58 0.24% 32 Shared Store 16 0.04%
239 0000e16f e32476e0 PRMT R31, R31, 0x5410, R32 57 0.24% 32
240 0000e16f e32476f0 STS.U16 [R69], R22 56 0.24% 32 Shared Store 16 0.04%
241 0000e16f e3247700 PRMT R21, R23, 0x7632, R21 55 0.24% 32
242 0000e16f e3247710 STS.U16 [R66], R64 55 0.24% 32 Shared Store 16 0.04%
243 0000e16f e3247720 STS.U16 [R68], R23 54 0.24% 32 Shared Store 16 0.04%
244 0000e16f e3247730 IMAD R22, R47, 0x10000, R46 54 0.24% 32
245 0000e16f e3247740 STS.U16 [R67], R21 52 0.24% 32 Shared Store 16 0.04%
246 0000e16f e3247750 BAR.SYNC.DEFER_BLOCKING 0x0 51 0.24% 32
247 0000e16f e3247760 PRMT R23, R42, 0x5410, R43 52 0.24% 32
248 0000e16f e3247770 PRMT R21, R55, 0x5410, R54 51 0.24% 32
249 0000e16f e3247780 LDS.U16 R36, [R2] 50 0.24% 32 Shared Load 16 0.27%
250 0000e16f e3247790 LDS.U16 R37, [R2+0x2] 51 0.24% 32 Shared Load 16 0.23%
251 0000e16f e32477a0 LDS.U16 R33, [R3+0x400] 52 0.24% 32 Shared Load 16 0.27%
252 0000e16f e32477b0 LDS.U16 R34, [R3+0x402] 53 0.24% 32 Shared Load 16 0.23%
253 0000e16f e32477c0 LDS.U16 R35, [R2+0x10] 54 0.24% 32 Shared Load 16 0.27%
254 0000e16f e32477d0 LDS.U16 R39, [R3+0x430] 55 0.24% 32 Shared Load 16 0.27%
255 0000e16f e32477e0 LDS.U16 R42, [R3+0x432] 56 0.24% 32 Shared Load 16 0.23%
256 0000e16f e32477f0 LDS.U16 R40, [R2+0x40] 57 0.24% 32 Shared Load 16 0.27%
257 0000e16f e3247800 LDS.U16 R41, [R2+0x42] 58 0.24% 32 Shared Load 16 0.23%
258 0000e16f e3247810 LDS.U16 R46, [R3+0x412] 59 0.24% 32 Shared Load 16 0.23%
259 0000e16f e3247820 LDS.U16 R38, [R2+0x30] 60 0.24% 32 Shared Load 16 0.27%
260 0000e16f e3247830 IMAD R32, R37, 0x10000, R36 61 0.24% 32
261 0000e16f e3247840 LDS.U16 R43, [R2+0x32] 60 0.24% 32 Shared Load 16 0.23%
262 0000e16f e3247850 LDS.U16 R36, [R2+0x12] 61 0.24% 32 Shared Load 16 0.23%
263 0000e16f e3247860 PRMT R33, R33, 0x5410, R34 61 0.24% 32
264 0000e16f e3247870 LDS.U16 R37, [R3+0x410] 61 0.24% 32 Shared Load 16 0.27%
265 0000e16f e3247880 LDS.U16 R47, [R2+0x20] 62 0.24% 32 Shared Load 16 0.27%
266 0000e16f e3247890 LDS.U16 R54, [R2+0x22] 63 0.24% 32 Shared Load 16 0.23%
267 0000e16f e32478a0 PRMT R39, R39, 0x5410, R42 63 0.24% 32
268 0000e16f e32478b0 LDS.U16 R55, [R3+0x420] 63 0.24% 32 Shared Load 16 0.27%
269 0000e16f e32478c0 LDS.U16 R42, [R3+0x442] 64 0.24% 32 Shared Load 16 0.23%
270 0000e16f e32478d0 IMAD R40, R41, 0x10000, R40 64 0.24% 32
271 0000e16f e32478e0 LDS.U16 R56, [R3+0x422] 64 0.24% 32 Shared Load 16 0.23%
272 0000e16f e32478f0 LDS.U16 R41, [R3+0x440] 65 0.24% 32 Shared Load 16 0.27%
273 0000e16f e3247900 LDS.U16 R57, [R3+0x460] 66 0.24% 32 Shared Load 16 0.27%
274 0000e16f e3247910 IMAD R38, R43, 0x10000, R38 66 0.24% 32
275 0000e16f e3247920 LDS.U16 R58, [R2+0x70] 66 0.24% 32 Shared Load 16 0.27%
276 0000e16f e3247930 IMAD R34, R36, 0x10000, R35 67 0.24% 32
277 0000e16f e3247940 LDS.U16 R43, [R2+0x50] 66 0.24% 32 Shared Load 16 0.27%
278 0000e16f e3247950 PRMT R35, R37, 0x5410, R46 67 0.24% 32
279 0000e16f e3247960 LDS.U16 R64, [R3+0x470] 66 0.24% 32 Shared Load 16 0.27%
280 0000e16f e3247970 LDS.U16 R46, [R2+0x52] 67 0.24% 32 Shared Load 16 0.23%
281 0000e16f e3247980 IMAD R36, R54, 0x10000, R47 68 0.24% 32
282 0000e16f e3247990 LDS.U16 R59, [R3+0x472] 67 0.24% 32 Shared Load 16 0.23%
283 0000e16f e32479a0 LDS.U16 R47, [R3+0x450] 68 0.24% 32 Shared Load 16 0.27%
284 0000e16f e32479b0 LDS.U16 R54, [R3+0x452] 69 0.24% 32 Shared Load 16 0.23%
285 0000e16f e32479c0 PRMT R37, R55, 0x5410, R56 70 0.24% 32
286 0000e16f e32479d0 LDS.U16 R56, [R2+0x60] 69 0.24% 32 Shared Load 16 0.27%
287 0000e16f e32479e0 PRMT R41, R41, 0x5410, R42 69 0.24% 32
288 0000e16f e32479f0 LDS.U16 R55, [R2+0x62] 69 0.24% 32 Shared Load 16 0.23%
289 0000e16f e3247a00 IMAD R42, R46, 0x10000, R43 70 0.24% 32
290 0000e16f e3247a10 LDS.U16 R46, [R3+0x462] 69 0.24% 32 Shared Load 16 0.23%
291 0000e16f e3247a20 PRMT R59, R64, 0x5410, R59 69 0.24% 32
292 0000e16f e3247a30 PRMT R43, R47, 0x5410, R54 69 0.24% 32
293 0000e16f e3247a40 LDS.U16 R47, [R2+0x72] 69 0.24% 32 Shared Load 16 0.23%
294 0000e16f e3247a50 PRMT R54, R61, 0x7632, R54 69 0.24% 32
295 0000e16f e3247a60 BAR.SYNC.DEFER_BLOCKING 0x0 69 0.24% 32
296 0000e16f e3247a70 IMAD R56, R55, 0x10000, R56 69 0.24% 32
297 0000e16f e3247a80 PRMT R55, R63, 0x7632, R55 69 0.24% 32
298 0000e16f e3247a90 PRMT R57, R57, 0x5410, R46 69 0.24% 32
299 0000e16f e3247aa0 STS.U16 [R45], R60 69 0.24% 32 Shared Store 16 0.04%
300 0000e16f e3247ab0 PRMT R46, R60, 0x7632, R46 68 0.24% 32
301 0000e16f e3247ac0 STS.U16 [R44], R46 67 0.24% 32 Shared Store 16 0.04%
302 0000e16f e3247ad0 STS.U16 [R52], R61 65 0.24% 32 Shared Store 16 0.04%
303 0000e16f e3247ae0 IMAD R58, R47, 0x10000, R58 63 0.24% 32
304 0000e16f e3247af0 PRMT R47, R62, 0x7632, R47 63 0.24% 32
305 0000e16f e3247b00 STS.U16 [R53], R54 63 0.24% 32 Shared Store 16 0.04%
306 0000e16f e3247b10 STS.U16 [R69], R62 61 0.24% 32 Shared Store 16 0.04%
307 0000e16f e3247b20 STS.U16 [R66], R47 59 0.24% 32 Shared Store 16 0.04%
308 0000e16f e3247b30 STS.U16 [R68], R63 57 0.24% 32 Shared Store 16 0.04%
309 0000e16f e3247b40 STS.U16 [R67], R55 55 0.24% 32 Shared Store 16 0.04%
310 0000e16f e3247b50 BAR.SYNC.DEFER_BLOCKING 0x0 53 0.24% 32
311 0000e16f e3247b60 LDSM.16.MT88.4 R44, [R65] 57 0.24% 32 Shared Load 128
312 0000e16f e3247b70 LDSM.16.MT88.4 R52, [R65+0x400] 61 0.24% 32 Shared Load 128
313 0000e16f e3247b80 HMMA.16816.F32 R48, R48, R44, RZ 61 0.24% 32
314 0000e16f e3247b90 NOP 59 0.24% 32
315 0000e16f e3247ba0 HMMA.16816.F32 R4, R4, R46, R48 59 0.24% 32
316 0000e16f e3247bb0 NOP 53 0.24% 32
317 0000e16f e3247bc0 HMMA.16816.F32 R4, R8, R52, R4 53 0.24% 32
318 0000e16f e3247bd0 LDSM.16.MT88.4 R8, [R65+0x800] 51 0.24% 32 Shared Load 128
319 0000e16f e3247be0 NOP 51 0.24% 32
320 0000e16f e3247bf0 HMMA.16816.F32 R4, R12, R54, R4 51 0.24% 32
321 0000e16f e3247c00 LDSM.16.MT88.4 R12, [R65+0x1000] 49 0.24% 32 Shared Load 128
322 0000e16f e3247c10 NOP 49 0.24% 32
323 0000e16f e3247c20 HMMA.16816.F32 R16, R16, R8, R4 49 0.24% 32
324 0000e16f e3247c30 LDSM.16.MT88.4 R4, [R65+0xc00] 47 0.24% 32 Shared Load 128
325 0000e16f e3247c40 NOP 47 0.24% 32
326 0000e16f e3247c50 HMMA.16816.F32 R16, R20, R10, R16 47 0.24% 32
327 0000e16f e3247c60 LDSM.16.MT88.4 R8, [R65+0x1400] 45 0.24% 32 Shared Load 128
328 0000e16f e3247c70 LDS.U16 R22, [R2+0x30] 46 0.24% 32 Shared Load 16 0.27%
329 0000e16f e3247c80 LDS.U16 R23, [R2+0x32] 47 0.24% 32 Shared Load 16 0.23%
330 0000e16f e3247c90 LDS.U16 R20, [R3+0x420] 48 0.24% 32 Shared Load 16 0.27%
331 0000e16f e3247ca0 LDS.U16 R21, [R3+0x422] 49 0.24% 32 Shared Load 16 0.23%
332 0000e16f e3247cb0 HMMA.16816.F32 R16, R24, R4, R16 49 0.24% 32
333 0000e16f e3247cc0 LDS.U16 R24, [R3+0x430] 44 0.24% 32 Shared Load 16 0.27%
334 0000e16f e3247cd0 LDS.U16 R25, [R3+0x432] 45 0.24% 32 Shared Load 16 0.23%
335 0000e16f e3247ce0 LDS.U16 R26, [R2+0x50] 46 0.24% 32 Shared Load 16 0.27%
336 0000e16f e3247cf0 LDS.U16 R27, [R2+0x52] 47 0.24% 32 Shared Load 16 0.23%
337 0000e16f e3247d00 HMMA.16816.F32 R16, R28, R6, R16 47 0.24% 32
338 0000e16f e3247d10 LDSM.16.MT88.4 R4, [R65+0x1800] 45 0.24% 32 Shared Load 128
339 0000e16f e3247d20 LDS.U16 R28, [R3+0x450] 46 0.24% 32 Shared Load 16 0.27%
340 0000e16f e3247d30 LDS.U16 R29, [R3+0x452] 47 0.24% 32 Shared Load 16 0.23%
341 0000e16f e3247d40 LDSM.16.MT88.4 R64, [R65+0x1c00] 50 0.24% 32 Shared Load 128
342 0000e16f e3247d50 HMMA.16816.F32 R16, R32, R12, R16 50 0.24% 32
343 0000e16f e3247d60 LDS.U16 R12, [R3+0x400] 45 0.24% 32 Shared Load 16 0.27%
344 0000e16f e3247d70 LDS.U16 R13, [R3+0x402] 46 0.24% 32 Shared Load 16 0.23%
345 0000e16f e3247d80 NOP 46 0.24% 32
346 0000e16f e3247d90 HMMA.16816.F32 R16, R36, R14, R16 46 0.24% 32
347 0000e16f e3247da0 LDS.U16 R14, [R2+0x10] 41 0.24% 32 Shared Load 16 0.27%
348 0000e16f e3247db0 LDS.U16 R15, [R2+0x12] 42 0.24% 32 Shared Load 16 0.23%
349 0000e16f e3247dc0 NOP 42 0.24% 32
350 0000e16f e3247dd0 HMMA.16816.F32 R40, R40, R8, R16 42 0.24% 32
351 0000e16f e3247de0 LDS.U16 R8, [R2] 37 0.24% 32 Shared Load 16 0.27%
352 0000e16f e3247df0 LDS.U16 R9, [R2+0x2] 38 0.24% 32 Shared Load 16 0.23%
353 0000e16f e3247e00 LDS.U16 R16, [R3+0x410] 39 0.24% 32 Shared Load 16 0.27%
354 0000e16f e3247e10 LDS.U16 R17, [R3+0x412] 40 0.24% 32 Shared Load 16 0.23%
355 0000e16f e3247e20 LDS.U16 R18, [R2+0x20] 41 0.24% 32 Shared Load 16 0.27%
356 0000e16f e3247e30 LDS.U16 R19, [R2+0x22] 42 0.24% 32 Shared Load 16 0.23%
357 0000e16f e3247e40 HMMA.16816.F32 R40, R56, R10, R40 42 0.24% 32
358 0000e16f e3247e50 IMAD R10, R15, 0x10000, R14 37 33.33% 0.24% 32
359 0000e16f e3247e60 PRMT R15, R24, 0x5410, R25 36 0.24% 32
360 0000e16f e3247e70 IMAD R14, R23, 0x10000, R22 35 0.24% 32
361 0000e16f e3247e80 S2R R22, SR_TID.X 34 0.24% 32
362 0000e16f e3247e90 IMAD R8, R9, 0x10000, R8 34 0.24% 32
363 0000e16f e3247ea0 PRMT R9, R12, 0x5410, R13 34 0.24% 32
364 0000e16f e3247eb0 PRMT R13, R20, 0x5410, R21 33 0.24% 32
365 0000e16f e3247ec0 LDS.U16 R20, [R3+0x470] 32 0.24% 32 Shared Load 16 0.27%
366 0000e16f e3247ed0 PRMT R11, R16, 0x5410, R17 33 0.24% 32
367 0000e16f e3247ee0 LDS.U16 R21, [R3+0x472] 32 0.24% 32 Shared Load 16 0.23%
368 0000e16f e3247ef0 LDS.U16 R16, [R3+0x440] 33 0.24% 32 Shared Load 16 0.27%
369 0000e16f e3247f00 HMMA.16816.F32 R8, R8, R4, R40 33 0.24% 32
370 0000e16f e3247f10 IMAD R12, R19, 0x10000, R18 28 0.24% 32
371 0000e16f e3247f20 LDS.U16 R4, [R2+0x40] 27 0.24% 32 Shared Load 16 0.27%
372 0000e16f e3247f30 LDS.U16 R5, [R2+0x42] 28 0.24% 32 Shared Load 16 0.23%
373 0000e16f e3247f40 LDS.U16 R17, [R3+0x442] 29 0.24% 32 Shared Load 16 0.23%
374 0000e16f e3247f50 LDS.U16 R18, [R3+0x462] 30 0.24% 32 Shared Load 16 0.23%
375 0000e16f e3247f60 LDS.U16 R19, [R2+0x72] 31 0.24% 32 Shared Load 16 0.23%
376 0000e16f e3247f70 HMMA.16816.F32 R8, R12, R6, R8 31 0.24% 32
377 0000e16f e3247f80 LDS.U16 R12, [R2+0x60] 26 0.24% 32 Shared Load 16 0.27%
378 0000e16f e3247f90 LDS.U16 R13, [R2+0x62] 27 0.24% 32 Shared Load 16 0.23%
379 0000e16f e3247fa0 IMAD R6, R27, 0x10000, R26 28 0.24% 32
380 0000e16f e3247fb0 PRMT R7, R28, 0x5410, R29 27 0.24% 32
381 0000e16f e3247fc0 LDS.U16 R15, [R3+0x460] 26 0.24% 32 Shared Load 16 0.27%
382 0000e16f e3247fd0 LDS.U16 R14, [R2+0x70] 26 0.24% 32 Shared Load 16 0.27%
383 0000e16f e3247fe0 IMAD.SHL.U32 R2, R22, 0x4, RZ 26 0.24% 32
384 0000e16f e3247ff0 IMAD R4, R5, 0x10000, R4 26 0.24% 32
385 0000e16f e3248000 PRMT R5, R16, 0x5410, R17 26 0.24% 32
386 0000e16f e3248010 LOP3.LUT R2, R2, 0x70, RZ, 0xc0, !PT 24 0.24% 32
387 0000e16f e3248020 HMMA.16816.F32 R4, R4, R64, R8 24 0.24% 32
388 0000e16f e3248030 IMAD R12, R13, 0x10000, R12 18 0.24% 32
389 0000e16f e3248040 PRMT R13, R15, 0x5410, R18 18 0.24% 32
390 0000e16f e3248050 PRMT R15, R20, 0x5410, R21 17 0.24% 32
391 0000e16f e3248060 IMAD R14, R19, 0x10000, R14 15 0.24% 32
392 0000e16f e3248070 NOP 14 0.24% 32
393 0000e16f e3248080 HMMA.16816.F32 R64, R12, R66, R4 16 0.24% 32
394 0000e16f e3248090 IMAD.SHL.U32 R5, R22, 0x2, RZ 9 0.24% 32
395 0000e16f e32480a0 LOP3.LUT R4, R2, 0x80, RZ, 0xfc, !PT 10 0.24% 32
396 0000e16f e32480b0 LOP3.LUT R22, R22, 0x4, RZ, 0xc0, !PT 10 0.24% 32
397 0000e16f e32480c0 LOP3.LUT R3, R2, 0x6, R5, 0xf8, !PT 11 0.24% 32
398 0000e16f e32480d0 LEA.HI R2, R2, UR4, RZ, 0x1f 11 0.24% 32
399 0000e16f e32480e0 IMAD.SHL.U32 R22, R22, 0x2, RZ 11 0.24% 32
400 0000e16f e32480f0 LOP3.LUT R3, R3, R0, RZ, 0xfc, !PT 11 0.24% 32
401 0000e16f e3248100 LOP3.LUT R0, R5, 0xf0, RZ, 0xc0, !PT 11 0.24% 32
402 0000e16f e3248110 LEA.HI R4, R4, UR4, RZ, 0x1f 11 0.24% 32
403 0000e16f e3248120 LEA.HI R6, R0, UR4, RZ, 0x1f 12 0.24% 32
404 0000e16f e3248130 IMAD R0, R3, 0x4, R2 12 0.24% 32
405 0000e16f e3248140 BAR.SYNC.DEFER_BLOCKING 0x0 11 0.24% 32
406 0000e16f e3248150 IMAD R4, R3, 0x4, R4 11 0.24% 32
407 0000e16f e3248160 LOP3.LUT R22, R22, 0x6, R5, 0xf8, !PT 10 0.24% 32
408 0000e16f e3248170 LOP3.LUT R5, R22, 0xf0, R5, 0xf8, !PT 10 0.24% 32
409 0000e16f e3248180 LDC.64 R2, c[0x0][0x218] 11 0.24% 32
410 0000e16f e3248190 IMAD R6, R5, 0x4, R6 11 0.24% 32
411 0000e16f e32481a0 STS.64 [R0], R64 11 0.24% 32 Shared Store 64 0.08%
412 0000e16f e32481b0 STS.64 [R4+0x200], R66 8 0.24% 32 Shared Store 64 0.08%
413 0000e16f e32481c0 IMAD.WIDE.U32 R2, R5, 0x4, R2 5 0.24% 32
414 0000e16f e32481d0 BAR.SYNC.DEFER_BLOCKING 0x0 4 0.24% 32
415 0000e16f e32481e0 LDS.64 R6, [R6] 5 0.24% 32 Shared Load 64 0.08%
416 0000e16f e32481f0 STG.E.64 desc[UR6][R2.64], R6 5 0.24% 32 Global Store 64
417 0000e16f e3248200 EXIT 1 0.24% 32
418 0000e16f e3248210 BRA 0xe16fe3248210 1
419 0000e16f e3248220 NOP
420 0000e16f e3248230 NOP
421 0000e16f e3248240 NOP
422 0000e16f e3248250 NOP
423 0000e16f e3248260 NOP
424 0000e16f e3248270 NOP
425 0000e16f e3248280 NOP
426 0000e16f e3248290 NOP
427 0000e16f e32482a0 NOP
428 0000e16f e32482b0 NOP
429 0000e16f e32482c0 NOP
430 0000e16f e32482d0 NOP
431 0000e16f e32482e0 NOP
432 0000e16f e32482f0 NOP
Environment details
Triton: main
GPU: GH200