-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathencode.avx512vl.cpp
56 lines (40 loc) · 2.82 KB
/
encode.avx512vl.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// This file is a copy of encode.avx512bw.cpp with the changes necessary to
// use the VPMULTISHIFTQB instruction.
namespace base64 {

    namespace avx512vl {

        // Translates `bytes` bytes of `input` into base64 characters written to `output`,
        // using VPERMB + VPMULTISHIFTQB.
        //
        // Parameters:
        //   input  - source bytes; exactly `bytes` bytes are read, never more.
        //   bytes  - number of input bytes. Processing is done in blocks of 48 input
        //            bytes; a short final block is zero-extended to 48 bytes.
        //   output - destination; every block (including a short final one) stores a
        //            full 64 bytes, so the buffer must have room for
        //            64 * ceil(bytes / 48) bytes.
        //
        // No '=' padding is produced: for a short final block the positions past the
        // real data simply hold the encoding of zero bits. Callers that need RFC 4648
        // padding have to handle the tail themselves.
        //
        // The function-level target attribute lets GCC/Clang compile this kernel even
        // when the translation unit is not built with -mavx512vbmi; the caller remains
        // responsible for invoking it only on CPUs that support these extensions.
#if defined(__GNUC__) || defined(__clang__)
        __attribute__((target("avx512f,avx512bw,avx512vbmi")))
#endif
        void encode(const uint8_t* input, size_t bytes, uint8_t* output) {

            static const char lookup_tbl[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

            const size_t    block_in        = 4 * 12;   // input bytes consumed per iteration
            const size_t    block_out       = 64;       // output bytes produced per iteration
            const __mmask64 full_block_mask = 0x0000ffffffffffffULL; // low 48 bytes

            uint8_t* out = output;

            // 32-bit input: [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
            //                                                   2                       1                       0
            // output order  [1, 2, 0, 1], i.e.:
            //               [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
            //
            // constants generated by script/permutexvar_parameters.py
            // (the indices only reference source bytes 0..47)
            const __m512i shuffle_input = _mm512_setr_epi32(
                0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
                0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
                0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
                0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);

            const __m512i lookup = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(lookup_tbl));

            // Per-byte bit offsets for VPMULTISHIFTQB, identical for every 64-bit element.
            // From the most to the least significant control byte: 48, 54, 36, 42, 16, 22, 4, 10
            const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040aLL);

            for (size_t i = 0; i < bytes; i += block_in) {

                // Load only bytes that belong to the input: at most 48, fewer in a short
                // final block. Masked-off bytes are not accessed and read as zero, so the
                // load never touches memory past `input + bytes`.
                const size_t    remaining = bytes - i;
                const __mmask64 load_mask = (remaining >= block_in)
                                          ? full_block_mask
                                          : ((static_cast<__mmask64>(1) << remaining) - 1);

                const __m512i v = _mm512_maskz_loadu_epi8(load_mask, input + i);

                // reorder bytes
                // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
                const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);

                // after multishift a single 32-bit lane has following layout:
                // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
                //
                // i.e. the 6-bit fields are taken from these bits of the shuffled lane:
                // a = [10:15], b = [4:9], c = [22:27], d = [16:21]
                const __m512i indices = _mm512_multishift_epi64_epi8(shifts, in);

                // Note: the two higher bits of each indices' byte have garbage,
                //       but the following permutexvar instruction masks them out.

                // translation 6-bit values -> ASCII
                const __m512i result = _mm512_permutexvar_epi8(indices, lookup);

                _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), result);

                out += block_out;
            }
        }

    } // namespace avx512vl

} // namespace base64