From 92da9f1085f71297fc63df66b3a97b819391331e Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 14:06:42 +0100 Subject: [PATCH 1/7] lib/env.h: add BASE64_FORCE_INLINE macro Add a macro to enforce inlining of functions in the hot loop, regardless of optimization level or compiler heuristics. --- lib/env.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/env.h b/lib/env.h index d489ba5..0837065 100644 --- a/lib/env.h +++ b/lib/env.h @@ -71,4 +71,14 @@ # define BASE64_FALLTHROUGH #endif +// Declare macros to ensure that functions that are intended to be inlined, are +// actually inlined, even when no optimization is applied. A lot of inner loop +// code is factored into separate functions for reasons of readability, but +// that code should always be inlined (and optimized) in the main loop. +#ifdef _MSC_VER +# define BASE64_FORCE_INLINE __forceinline +#else +# define BASE64_FORCE_INLINE inline __attribute__((always_inline)) +#endif + #endif // BASE64_ENV_H From c88662a8ee3ab3622af768e99f20107dcbf1fc03 Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 14:18:12 +0100 Subject: [PATCH 2/7] avx2: apply BASE64_FORCE_INLINE macro --- lib/arch/avx2/dec_loop.c | 2 +- lib/arch/avx2/dec_reshuffle.c | 2 +- lib/arch/avx2/enc_loop.c | 4 ++-- lib/arch/avx2/enc_reshuffle.c | 2 +- lib/arch/avx2/enc_translate.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c index f959fc4..b8a4cca 100644 --- a/lib/arch/avx2/dec_loop.c +++ b/lib/arch/avx2/dec_loop.c @@ -1,4 +1,4 @@ -static inline int +static BASE64_FORCE_INLINE int dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds) { const __m256i lut_lo = _mm256_setr_epi8( diff --git a/lib/arch/avx2/dec_reshuffle.c b/lib/arch/avx2/dec_reshuffle.c index f351809..bc875ce 100644 --- a/lib/arch/avx2/dec_reshuffle.c +++ b/lib/arch/avx2/dec_reshuffle.c @@ -1,4 +1,4 @@ -static inline __m256i +static BASE64_FORCE_INLINE __m256i dec_reshuffle (const __m256i in) { // in, lower lane, bits, upper case are most significant bits, lower diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c index b9e2736..6f4aa0a 100644 --- a/lib/arch/avx2/enc_loop.c +++ b/lib/arch/avx2/enc_loop.c @@ -1,4 +1,4 @@ -static inline void +static BASE64_FORCE_INLINE void enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o) { // First load is done at s - 0 to not get a segfault: @@ -17,7 +17,7 @@ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o) *o += 32; } -static inline void +static BASE64_FORCE_INLINE void enc_loop_avx2_inner (const uint8_t **s, uint8_t **o) { // Load input: diff --git a/lib/arch/avx2/enc_reshuffle.c b/lib/arch/avx2/enc_reshuffle.c index ba16690..82c659b 100644 --- a/lib/arch/avx2/enc_reshuffle.c +++ b/lib/arch/avx2/enc_reshuffle.c @@ -1,4 +1,4 @@ -static inline __m256i +static BASE64_FORCE_INLINE __m256i enc_reshuffle (const __m256i input) { // Translation of the SSSE3 reshuffling algorithm to AVX2. This one diff --git a/lib/arch/avx2/enc_translate.c b/lib/arch/avx2/enc_translate.c index 46173cd..370da98 100644 --- a/lib/arch/avx2/enc_translate.c +++ b/lib/arch/avx2/enc_translate.c @@ -1,4 +1,4 @@ -static inline __m256i +static BASE64_FORCE_INLINE __m256i enc_translate (const __m256i in) { // A lookup table containing the absolute offsets for all ranges: From 01c22e81861cab440b755c1137c05de2f095703c Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 14:26:18 +0100 Subject: [PATCH 3/7] avx512: apply BASE64_FORCE_INLINE macro --- lib/arch/avx512/enc_loop.c | 2 +- lib/arch/avx512/enc_reshuffle_translate.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c index 4c71e16..cb44696 100644 --- a/lib/arch/avx512/enc_loop.c +++ b/lib/arch/avx512/enc_loop.c @@ -1,4 +1,4 @@ -static inline void +static BASE64_FORCE_INLINE void enc_loop_avx512_inner (const uint8_t **s, uint8_t **o) { // Load input. diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c index 5c332bb..c6634f0 100644 --- a/lib/arch/avx512/enc_reshuffle_translate.c +++ b/lib/arch/avx512/enc_reshuffle_translate.c @@ -1,7 +1,7 @@ // AVX512 algorithm is based on permutevar and multishift. The code is based on // https://github.com/WojciechMula/base64simd which is under BSD-2 license. -static inline __m512i +static BASE64_FORCE_INLINE __m512i enc_reshuffle_translate (const __m512i input) { // 32-bit input From ca095e85044dc9544fd172dced0f988539f75ca6 Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 19:31:28 +0100 Subject: [PATCH 4/7] generic: apply BASE64_FORCE_INLINE macro --- lib/arch/generic/32/dec_loop.c | 2 +- lib/arch/generic/32/enc_loop.c | 2 +- lib/arch/generic/64/enc_loop.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/arch/generic/32/dec_loop.c b/lib/arch/generic/32/dec_loop.c index 8a8260f..aa290d7 100644 --- a/lib/arch/generic/32/dec_loop.c +++ b/lib/arch/generic/32/dec_loop.c @@ -1,4 +1,4 @@ -static inline int +static BASE64_FORCE_INLINE int dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds) { const uint32_t str diff --git a/lib/arch/generic/32/enc_loop.c b/lib/arch/generic/32/enc_loop.c index f4870a7..b5e6eef 100644 --- a/lib/arch/generic/32/enc_loop.c +++ b/lib/arch/generic/32/enc_loop.c @@ -1,4 +1,4 @@ -static inline void +static BASE64_FORCE_INLINE void enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o) { uint32_t src; diff --git a/lib/arch/generic/64/enc_loop.c b/lib/arch/generic/64/enc_loop.c index 0840bc7..e6a29cd 100644 --- a/lib/arch/generic/64/enc_loop.c +++ b/lib/arch/generic/64/enc_loop.c @@ -1,4 +1,4 @@ -static inline void +static BASE64_FORCE_INLINE void enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o) { uint64_t src; From a4e2252335d05557b84447a8f39a67b30a79954c Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 19:30:26 +0100 Subject: [PATCH 5/7] neon32: apply BASE64_FORCE_INLINE macro --- lib/arch/neon32/codec.c | 2 +- lib/arch/neon32/dec_loop.c | 6 +++--- lib/arch/neon32/enc_loop.c | 4 ++-- lib/arch/neon32/enc_reshuffle.c | 2 +- lib/arch/neon32/enc_translate.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/arch/neon32/codec.c b/lib/arch/neon32/codec.c index 70c80e4..d552344 100644 --- a/lib/arch/neon32/codec.c +++ b/lib/arch/neon32/codec.c @@ -22,7 +22,7 @@ #define BASE64_NEON32_USE_ASM #endif -static inline uint8x16_t +static BASE64_FORCE_INLINE uint8x16_t vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices) { // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate diff --git a/lib/arch/neon32/dec_loop.c b/lib/arch/neon32/dec_loop.c index 2216b39..e4caed7 100644 --- a/lib/arch/neon32/dec_loop.c +++ b/lib/arch/neon32/dec_loop.c @@ -1,4 +1,4 @@ -static inline int +static BASE64_FORCE_INLINE int is_nonzero (const uint8x16_t v) { uint64_t u64; @@ -9,7 +9,7 @@ is_nonzero (const uint8x16_t v) return u64 != 0; } -static inline uint8x16_t +static BASE64_FORCE_INLINE uint8x16_t delta_lookup (const uint8x16_t v) { const uint8x8_t lut = { @@ -21,7 +21,7 @@ delta_lookup (const uint8x16_t v) vtbl1_u8(lut, vget_high_u8(v))); } -static inline uint8x16_t +static BASE64_FORCE_INLINE uint8x16_t dec_loop_neon32_lane (uint8x16_t *lane) { // See the SSSE3 decoder for an explanation of the algorithm. diff --git a/lib/arch/neon32/enc_loop.c b/lib/arch/neon32/enc_loop.c index d694b33..2adff48 100644 --- a/lib/arch/neon32/enc_loop.c +++ b/lib/arch/neon32/enc_loop.c @@ -1,5 +1,5 @@ #ifdef BASE64_NEON32_USE_ASM -static inline void +static BASE64_FORCE_INLINE void enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o) { // This function duplicates the functionality of enc_loop_neon32_inner, @@ -106,7 +106,7 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o) } #endif -static inline void +static BASE64_FORCE_INLINE void enc_loop_neon32_inner (const uint8_t **s, uint8_t **o) { #ifdef BASE64_NEON32_USE_ASM diff --git a/lib/arch/neon32/enc_reshuffle.c b/lib/arch/neon32/enc_reshuffle.c index d6e97cb..fa94d27 100644 --- a/lib/arch/neon32/enc_reshuffle.c +++ b/lib/arch/neon32/enc_reshuffle.c @@ -1,4 +1,4 @@ -static inline uint8x16x4_t +static BASE64_FORCE_INLINE uint8x16x4_t enc_reshuffle (uint8x16x3_t in) { uint8x16x4_t out; diff --git a/lib/arch/neon32/enc_translate.c b/lib/arch/neon32/enc_translate.c index e616d54..ff3d88d 100644 --- a/lib/arch/neon32/enc_translate.c +++ b/lib/arch/neon32/enc_translate.c @@ -1,4 +1,4 @@ -static inline uint8x16x4_t +static BASE64_FORCE_INLINE uint8x16x4_t enc_translate (const uint8x16x4_t in) { // A lookup table containing the absolute offsets for all ranges: From caf6eb1531455e492e2996579d7dd22d88033207 Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 19:30:37 +0100 Subject: [PATCH 6/7] neon64: apply BASE64_FORCE_INLINE macro --- lib/arch/neon64/codec.c | 2 +- lib/arch/neon64/enc_loop.c | 2 +- lib/arch/neon64/enc_reshuffle.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c index f5cda63..6b664b4 100644 --- a/lib/arch/neon64/codec.c +++ b/lib/arch/neon64/codec.c @@ -22,7 +22,7 @@ #define BASE64_NEON64_USE_ASM #endif -static inline uint8x16x4_t +static BASE64_FORCE_INLINE uint8x16x4_t load_64byte_table (const uint8_t *p) { #ifdef BASE64_NEON64_USE_ASM diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c index 59a1c59..8bdd088 100644 --- a/lib/arch/neon64/enc_loop.c +++ b/lib/arch/neon64/enc_loop.c @@ -1,4 +1,4 @@ -static inline void +static BASE64_FORCE_INLINE void enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) { // Load 48 bytes and deinterleave: diff --git a/lib/arch/neon64/enc_reshuffle.c b/lib/arch/neon64/enc_reshuffle.c index ea543e0..2655df1 100644 --- a/lib/arch/neon64/enc_reshuffle.c +++ b/lib/arch/neon64/enc_reshuffle.c @@ -1,4 +1,4 @@ -static inline uint8x16x4_t +static BASE64_FORCE_INLINE uint8x16x4_t enc_reshuffle (const uint8x16x3_t in) { uint8x16x4_t out; From 32e5eb6a5e701ac7b19fafffa7b14659596caf20 Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 13 Jan 2024 19:31:00 +0100 Subject: [PATCH 7/7] ssse3: apply BASE64_FORCE_INLINE macro --- lib/arch/ssse3/dec_loop.c | 2 +- lib/arch/ssse3/dec_reshuffle.c | 2 +- lib/arch/ssse3/enc_loop.c | 2 +- lib/arch/ssse3/enc_reshuffle.c | 2 +- lib/arch/ssse3/enc_translate.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/arch/ssse3/dec_loop.c b/lib/arch/ssse3/dec_loop.c index 9da71ab..7ddb73b 100644 --- a/lib/arch/ssse3/dec_loop.c +++ b/lib/arch/ssse3/dec_loop.c @@ -65,7 +65,7 @@ // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 -static inline int +static BASE64_FORCE_INLINE int dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds) { const __m128i lut_lo = _mm_setr_epi8( diff --git a/lib/arch/ssse3/dec_reshuffle.c b/lib/arch/ssse3/dec_reshuffle.c index fdf587f..d3dd395 100644 --- a/lib/arch/ssse3/dec_reshuffle.c +++ b/lib/arch/ssse3/dec_reshuffle.c @@ -1,4 +1,4 @@ -static inline __m128i +static BASE64_FORCE_INLINE __m128i dec_reshuffle (const __m128i in) { // in, bits, upper case are most significant bits, lower case are least significant bits diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c index 6de652e..9b67b70 100644 --- a/lib/arch/ssse3/enc_loop.c +++ b/lib/arch/ssse3/enc_loop.c @@ -1,4 +1,4 @@ -static inline void +static BASE64_FORCE_INLINE void enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o) { // Load input: diff --git a/lib/arch/ssse3/enc_reshuffle.c b/lib/arch/ssse3/enc_reshuffle.c index b738591..f9dc949 100644 --- a/lib/arch/ssse3/enc_reshuffle.c +++ b/lib/arch/ssse3/enc_reshuffle.c @@ -1,4 +1,4 @@ -static inline __m128i +static BASE64_FORCE_INLINE __m128i enc_reshuffle (__m128i in) { // Input, bytes MSB to LSB: diff --git a/lib/arch/ssse3/enc_translate.c b/lib/arch/ssse3/enc_translate.c index 04f288f..60d9a42 100644 --- a/lib/arch/ssse3/enc_translate.c +++ b/lib/arch/ssse3/enc_translate.c @@ -1,4 +1,4 @@ -static inline __m128i +static BASE64_FORCE_INLINE __m128i enc_translate (const __m128i in) { // A lookup table containing the absolute offsets for all ranges: