Skip to content
This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 4d5e0b9

Browse files
divakar-amdRobert Shaw
authored and
Robert Shaw
committed
[Kernel][ROCm][AMD] fused_moe Triton configs v2 for mi300X (vllm-project#5932)
1 parent 209a147 commit 4d5e0b9

4 files changed

+500
-194
lines changed
Lines changed: 118 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,128 +1,200 @@
11
{
22
"1": {
33
"BLOCK_SIZE_M": 16,
4-
"BLOCK_SIZE_N": 64,
5-
"BLOCK_SIZE_K": 128,
4+
"BLOCK_SIZE_N": 32,
5+
"BLOCK_SIZE_K": 256,
66
"GROUP_SIZE_M": 1,
7-
"num_stages": 0
7+
"num_warps": 2,
8+
"num_stages": 0,
9+
"waves_per_eu": 0,
10+
"matrix_instr_nonkdim": 16,
11+
"kpack": 1
812
},
913
"2": {
1014
"BLOCK_SIZE_M": 16,
11-
"BLOCK_SIZE_N": 64,
15+
"BLOCK_SIZE_N": 16,
1216
"BLOCK_SIZE_K": 128,
1317
"GROUP_SIZE_M": 1,
14-
"num_stages": 0
18+
"num_warps": 2,
19+
"num_stages": 0,
20+
"waves_per_eu": 0,
21+
"matrix_instr_nonkdim": 16,
22+
"kpack": 2
1523
},
1624
"4": {
1725
"BLOCK_SIZE_M": 16,
18-
"BLOCK_SIZE_N": 64,
26+
"BLOCK_SIZE_N": 32,
1927
"BLOCK_SIZE_K": 256,
20-
"GROUP_SIZE_M": 64,
21-
"num_stages": 1
28+
"GROUP_SIZE_M": 1,
29+
"num_warps": 2,
30+
"num_stages": 0,
31+
"waves_per_eu": 0,
32+
"matrix_instr_nonkdim": 16,
33+
"kpack": 2
2234
},
2335
"8": {
2436
"BLOCK_SIZE_M": 16,
25-
"BLOCK_SIZE_N": 64,
37+
"BLOCK_SIZE_N": 16,
2638
"BLOCK_SIZE_K": 256,
27-
"GROUP_SIZE_M": 32,
28-
"num_stages": 1
39+
"GROUP_SIZE_M": 1,
40+
"num_warps": 1,
41+
"num_stages": 0,
42+
"waves_per_eu": 0,
43+
"matrix_instr_nonkdim": 16,
44+
"kpack": 2
2945
},
3046
"16": {
3147
"BLOCK_SIZE_M": 16,
32-
"BLOCK_SIZE_N": 64,
48+
"BLOCK_SIZE_N": 16,
3349
"BLOCK_SIZE_K": 256,
34-
"GROUP_SIZE_M": 8,
35-
"num_stages": 1
50+
"GROUP_SIZE_M": 1,
51+
"num_warps": 4,
52+
"num_stages": 0,
53+
"waves_per_eu": 0,
54+
"matrix_instr_nonkdim": 16,
55+
"kpack": 2
3656
},
3757
"24": {
3858
"BLOCK_SIZE_M": 16,
39-
"BLOCK_SIZE_N": 64,
40-
"BLOCK_SIZE_K": 256,
41-
"GROUP_SIZE_M": 64,
42-
"num_stages": 1
59+
"BLOCK_SIZE_N": 32,
60+
"BLOCK_SIZE_K": 64,
61+
"GROUP_SIZE_M": 1,
62+
"num_warps": 1,
63+
"num_stages": 0,
64+
"waves_per_eu": 0,
65+
"matrix_instr_nonkdim": 16,
66+
"kpack": 2
4367
},
4468
"32": {
4569
"BLOCK_SIZE_M": 16,
46-
"BLOCK_SIZE_N": 128,
47-
"BLOCK_SIZE_K": 256,
48-
"GROUP_SIZE_M": 8,
49-
"num_stages": 1
70+
"BLOCK_SIZE_N": 16,
71+
"BLOCK_SIZE_K": 128,
72+
"GROUP_SIZE_M": 4,
73+
"num_warps": 2,
74+
"num_stages": 0,
75+
"waves_per_eu": 0,
76+
"matrix_instr_nonkdim": 16,
77+
"kpack": 1
5078
},
5179
"48": {
5280
"BLOCK_SIZE_M": 16,
53-
"BLOCK_SIZE_N": 64,
81+
"BLOCK_SIZE_N": 16,
5482
"BLOCK_SIZE_K": 128,
55-
"GROUP_SIZE_M": 8,
56-
"num_stages": 0
83+
"GROUP_SIZE_M": 4,
84+
"num_warps": 2,
85+
"num_stages": 0,
86+
"waves_per_eu": 0,
87+
"matrix_instr_nonkdim": 16,
88+
"kpack": 2
5789
},
5890
"64": {
59-
"BLOCK_SIZE_M": 64,
91+
"BLOCK_SIZE_M": 32,
6092
"BLOCK_SIZE_N": 64,
6193
"BLOCK_SIZE_K": 128,
62-
"GROUP_SIZE_M": 8,
63-
"num_stages": 0
94+
"GROUP_SIZE_M": 4,
95+
"num_warps": 8,
96+
"num_stages": 0,
97+
"waves_per_eu": 0,
98+
"matrix_instr_nonkdim": 16,
99+
"kpack": 2
64100
},
65101
"96": {
66102
"BLOCK_SIZE_M": 32,
67-
"BLOCK_SIZE_N": 128,
103+
"BLOCK_SIZE_N": 32,
68104
"BLOCK_SIZE_K": 128,
69-
"GROUP_SIZE_M": 16,
70-
"num_stages": 0
105+
"GROUP_SIZE_M": 4,
106+
"num_warps": 4,
107+
"num_stages": 0,
108+
"waves_per_eu": 0,
109+
"matrix_instr_nonkdim": 16,
110+
"kpack": 2
71111
},
72112
"128": {
73113
"BLOCK_SIZE_M": 64,
74114
"BLOCK_SIZE_N": 64,
75-
"BLOCK_SIZE_K": 128,
76-
"GROUP_SIZE_M": 8,
77-
"num_stages": 0
115+
"BLOCK_SIZE_K": 64,
116+
"GROUP_SIZE_M": 4,
117+
"num_warps": 8,
118+
"num_stages": 0,
119+
"waves_per_eu": 0,
120+
"matrix_instr_nonkdim": 16,
121+
"kpack": 2
78122
},
79123
"256": {
80124
"BLOCK_SIZE_M": 128,
81125
"BLOCK_SIZE_N": 128,
82126
"BLOCK_SIZE_K": 64,
83-
"GROUP_SIZE_M": 8,
84-
"num_stages": 0
127+
"GROUP_SIZE_M": 4,
128+
"num_warps": 8,
129+
"num_stages": 0,
130+
"waves_per_eu": 0,
131+
"matrix_instr_nonkdim": 16,
132+
"kpack": 1
85133
},
86134
"512": {
87-
"BLOCK_SIZE_M": 256,
135+
"BLOCK_SIZE_M": 128,
88136
"BLOCK_SIZE_N": 128,
89137
"BLOCK_SIZE_K": 64,
90-
"GROUP_SIZE_M": 8,
91-
"num_stages": 0
138+
"GROUP_SIZE_M": 4,
139+
"num_warps": 8,
140+
"num_stages": 0,
141+
"waves_per_eu": 0,
142+
"matrix_instr_nonkdim": 16,
143+
"kpack": 2
92144
},
93145
"1024": {
94146
"BLOCK_SIZE_M": 128,
95147
"BLOCK_SIZE_N": 128,
96148
"BLOCK_SIZE_K": 64,
97149
"GROUP_SIZE_M": 1,
98-
"num_stages": 0
150+
"num_warps": 8,
151+
"num_stages": 0,
152+
"waves_per_eu": 0,
153+
"matrix_instr_nonkdim": 32,
154+
"kpack": 2
99155
},
100156
"1536": {
101157
"BLOCK_SIZE_M": 128,
102158
"BLOCK_SIZE_N": 128,
103159
"BLOCK_SIZE_K": 64,
104160
"GROUP_SIZE_M": 1,
105-
"num_stages": 0
161+
"num_warps": 8,
162+
"num_stages": 0,
163+
"waves_per_eu": 0,
164+
"matrix_instr_nonkdim": 16,
165+
"kpack": 2
106166
},
107167
"2048": {
108168
"BLOCK_SIZE_M": 128,
109-
"BLOCK_SIZE_N": 256,
169+
"BLOCK_SIZE_N": 128,
110170
"BLOCK_SIZE_K": 64,
111171
"GROUP_SIZE_M": 1,
112-
"num_stages": 0
172+
"num_warps": 8,
173+
"num_stages": 0,
174+
"waves_per_eu": 0,
175+
"matrix_instr_nonkdim": 16,
176+
"kpack": 2
113177
},
114178
"3072": {
115179
"BLOCK_SIZE_M": 128,
116-
"BLOCK_SIZE_N": 256,
180+
"BLOCK_SIZE_N": 128,
117181
"BLOCK_SIZE_K": 64,
118182
"GROUP_SIZE_M": 1,
119-
"num_stages": 0
183+
"num_warps": 8,
184+
"num_stages": 0,
185+
"waves_per_eu": 0,
186+
"matrix_instr_nonkdim": 16,
187+
"kpack": 1
120188
},
121189
"4096": {
122190
"BLOCK_SIZE_M": 128,
123191
"BLOCK_SIZE_N": 128,
124192
"BLOCK_SIZE_K": 64,
125193
"GROUP_SIZE_M": 1,
126-
"num_stages": 0
194+
"num_warps": 8,
195+
"num_stages": 0,
196+
"waves_per_eu": 0,
197+
"matrix_instr_nonkdim": 16,
198+
"kpack": 1
127199
}
128200
}

0 commit comments

Comments
 (0)