/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 256-bit registers (ymm) and intrinsics.
 * So our main SIMD type is 256 bits wide and each such variable can
 * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel.
 */
#define _T_simd __m256i
#define _T_mask __mmask8

/* Naming convention for static const variables. */
#define _SC_(x) ymm_##x
#define _SV_(x) (ymm_##x.y)

/* Naming convention for internal functions. */
#define _F_(x) x##_avx512x8
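/*
 * For illustration, with the defines above a generic name used by
 * "acl_run_avx512_common.h" such as _F_(gather_bytes) expands to
 * gather_bytes_avx512x8, _SC_(match_mask) to ymm_match_mask, and
 * _SV_(match_mask) to (ymm_match_mask.y), the raw __m256i view
 * of that rte_ymm_t variable.
 */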
/*
 * The same intrinsics have different syntaxes (depending on the
 * bit-width), so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x) _mm256_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x) _mm256_##x##_si256

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x) _mm256_m##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale) _mm256_##name(base, idx, scale)
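/*
 * For example, _M_SI_(and) expands to _mm256_and_si256, while
 * _M_MGI_(mask_i64gather_epi32) yields _mm256_mmask_i64gather_epi32.
 * _M_GI_ also normalizes argument order: the 256-bit gather intrinsics
 * take (base, idx, scale), while their 512-bit counterparts take
 * (idx, base, scale), so shared code can use a single calling form.
 */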
/* num/mask of transitions per SIMD regs */
#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)

/* num/mask of pointers per SIMD regs */
#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_ RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
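/*
 * With the 256-bit SIMD type and 64-bit pointers these evaluate to:
 * _SIMD_MASK_BIT_ == 8, _SIMD_MASK_MAX_ == 0xff,
 * _SIMD_FLOW_NUM_ == 16, _SIMD_FLOW_MSK_ == 0xf,
 * _SIMD_PTR_NUM_ == 4, _SIMD_PTR_MSK_ == 0xf.
 */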
static const rte_ymm_t _SC_(match_mask) = {
	.u32 = {
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
	},
};

static const rte_ymm_t _SC_(index_mask) = {
	.u32 = {
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
	},
};

static const rte_ymm_t _SC_(trlo_idle) = {
	.u32 = {
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
	},
};

static const rte_ymm_t _SC_(trhi_idle) = {
	.u32 = {
		0, 0, 0, 0,
		0, 0, 0, 0,
	},
};

static const rte_ymm_t _SC_(shuffle_input) = {
	.u32 = {
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	},
};

static const rte_ymm_t _SC_(four_32) = {
	.u32 = {
		4, 4, 4, 4,
		4, 4, 4, 4,
	},
};

static const rte_ymm_t _SC_(idx_add) = {
	.u32 = {
		0, 1, 2, 3,
		4, 5, 6, 7,
	},
};

static const rte_ymm_t _SC_(range_base) = {
	.u32 = {
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	},
};

static const rte_ymm_t _SC_(pminp) = {
	.u32 = {
		0x00, 0x01, 0x02, 0x03,
		0x08, 0x09, 0x0a, 0x0b,
	},
};
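/*
 * Note that in _mm256_permutex2var_epi32() an index with bit 3 set
 * selects from the second operand, so the pminp pattern above picks
 * dwords 0-3 of the first vector followed by dwords 0-3 of the second
 * one, i.e. it concatenates two 128-bit intermediate results
 * (see gather_bytes() below).
 */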
static const __mmask16 _SC_(pmidx_msk) = 0x55;

static const rte_ymm_t _SC_(pmidx[2]) = {
	[0] = {
		.u32 = {
			0, 0, 1, 0, 2, 0, 3, 0,
		},
	},
	[1] = {
		.u32 = {
			4, 0, 5, 0, 6, 0, 7, 0,
		},
	},
};
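/*
 * pmidx_msk (0x55 == 0b01010101) selects the even dword lanes, so
 * permuting with pmidx[0]/pmidx[1] above spreads eight 32-bit values
 * into the low dwords of 64-bit lanes (four per register); the common
 * code uses this when widening 32-bit indexes for pointer arithmetic.
 */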
/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load on a byte quantity. So we have to mimic it in SW,
 * by doing 4x1B scalar loads.
 */
static inline __m128i
_m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
{
	rte_xmm_t v;
	rte_ymm_t p;

	static const uint32_t zero;

	/* replace pointers in inactive lanes with a pointer to zero */
	p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
		(uintptr_t)&zero);

	/* load one byte through each of the 4 pointers */
	v.u32[0] = *(uint8_t *)p.u64[0];
	v.u32[1] = *(uint8_t *)p.u64[1];
	v.u32[2] = *(uint8_t *)p.u64[2];
	v.u32[3] = *(uint8_t *)p.u64[3];

	return v.x;
}
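/*
 * E.g. with mask == 0x5 only the pointers in lanes 0 and 2 are
 * dereferenced; lanes 1 and 3 are redirected to &zero, so they yield 0
 * without touching arbitrary memory.
 */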
/*
 * Gather 4/1 input bytes for up to 8 (2*4) locations in parallel.
 */
static __rte_always_inline __m256i
_F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2],
	uint32_t bnum)
{
	__m128i inp[2];

	if (bnum == sizeof(uint8_t)) {
		/* no HW gather for single bytes, mimic it in SW */
		inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
		inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
	} else {
		/* 4B loads: use pointers as 64-bit indexes over a NULL base */
		inp[0] = _mm256_mmask_i64gather_epi32(
				_mm256_castsi256_si128(zero),
				m[0], p[0], NULL, sizeof(uint8_t));
		inp[1] = _mm256_mmask_i64gather_epi32(
				_mm256_castsi256_si128(zero),
				m[1], p[1], NULL, sizeof(uint8_t));
	}

	/* squeeze input into one 256-bit register */
	return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),
			_SV_(pminp), _mm256_castsi128_si256(inp[1]));
}
#include "acl_run_avx512_common.h"

/*
 * Perform search for up to (2 * 8) flows in parallel.
 * Use two sets of metadata, each serving up to 8 flows.
 */
static inline int
search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	uint32_t i, *pm;
	const struct rte_acl_match_results *pr;
	struct acl_flow_avx512 flow;
	uint32_t match[ctx->num_tries * total_packets];

	for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

		/* setup for next trie */
		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

		/* process the trie */
		_F_(search_trie)(&flow);
	}

	/* resolve matches */
	pr = (const struct rte_acl_match_results *)
		(ctx->trans_table + ctx->match_index);

	if (categories == 1)
		_F_(resolve_single_cat)(results, pr, match, total_packets,
			ctx->num_tries);
	else
		resolve_mcle8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);

	return 0;
}
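/*
 * Usage sketch, assuming "ctx" was built via rte_acl_add_rules() and
 * rte_acl_build(), and "data" holds "num" pointers to packet fields:
 *
 *	uint32_t results[num * categories];
 *	search_avx512x8x2(ctx, data, results, num, categories);
 *
 * In DPDK this helper is normally reached through the rte_acl_classify()
 * path once an AVX512-based classify method is selected.
 */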
#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd